Added pages config by token matching, improved meta tags parsing

2024-08-30 18:12:12 +00:00 · 2020-05-17 13:11:06 +01:00 · 2020-05-17 13:11:06 +01:00 · 74a16b8a17
commit 74a16b8a17
parent a70c8b4708
3 changed files with 88 additions and 57 deletions
--- a/.gitignore
+++ b/.gitignore
@ -112,6 +112,7 @@ dmypy.json
 .vscode
 env
 dist/*
+test/*
 debug.log
 webdrive.log
 *.bat
--- a/loconotion.py
+++ b/loconotion.py
@ -34,18 +34,6 @@ def setup_logger(name):

 log = setup_logger("loconotion-logger")

-def get_clean_slug(url, extension = True):
-  path = urllib.parse.urlparse(url).path.replace('/', '')
-  if ("-" in path and len(path.split("-")) > 1):
-    # a standard notion page looks like the-page-title-[uiid]
-    # strip the uuid and keep the page title only
-    path = "-".join(path.split("-")[:-1]).lower()
-  elif ("?" in path):
-    # database pages just have an uiid and a query param
-    # not much to do here, just get rid of the query param
-    path = path.split("?")[0].lower()
-  return path + (".html" if extension else "")
-
 def download_file(url, destination):
  if not Path(destination).is_file():
    # Disabling proxy speeds up requests time
@ -123,14 +111,14 @@ class Parser():
  def __init__(self, config = {}):
    url = config.get("page", None)
    if not url:
-      log.error("No url specified")
+      log.critical("No url specified")
      return

    self.driver = self.init_chromedriver()
    self.config = config

    # get the site name from the config, or make it up by cleaning the target page's slug
-    site_name = self.config.get("name", get_clean_slug(url, extension = False))
+    site_name = self.config.get("name", self.get_page_slug(url, extension = False))

    # set the output folder based on the site name, and create it if necessary
    self.dist_folder = Path(config.get("output", Path("dist") / site_name))
@ -139,6 +127,42 @@ class Parser():

    self.run(url)

+  def get_page_config(self, token):
+    # starts by grabbing the gobal site configuration table, if exists
+    site_config = self.config.get("site", {})
+    # find a table in the configuration file whose key contains the passed token string
+    matching_page_config = [value for key, value in self.config.items() if key.lower() in token]
+    if (matching_page_config):
+      if (len(matching_page_config) > 1):
+        log.error(f"multiple matching page config tokens found for {token} in configuration file. Make sure pages urls / slugs are unique")
+        return site_config
+      else:
+        # if found, merge it on top of the global site configuration table
+        # log.debug(f"Config table found for page with token {token}")
+        return {**site_config, **matching_page_config[0]}
+    else:
+      # log.debug(f"No config table found for page token {token}, using global site config table")
+      return site_config
+
+  def get_page_slug(self, url, extension = True):
+    # first check if the url has a custom slug configured in the config file
+    custom_slug = self.get_page_config(url).get("slug", None)
+    if custom_slug:
+      log.debug(f"Custom slug found for url {url}: {custom_slug}")
+      return custom_slug.replace('/', '') + (".html" if extension else "")
+    else:
+      # if not, clean up the existing slug
+      path = urllib.parse.urlparse(url).path.replace('/', '')
+      if ("-" in path and len(path.split("-")) > 1):
+        # a standard notion page looks like the-page-title-[uiid]
+        # strip the uuid and keep the page title only
+        path = "-".join(path.split("-")[:-1]).lower()
+      elif ("?" in path):
+        # database pages just have an uiid and a query param
+        # not much to do here, just get rid of the query param
+        path = path.split("?")[0].lower()
+      return path + (".html" if extension else "")
+
  def init_chromedriver(self):
    log.info("Initialising chrome driver")
    chrome_options = Options()  
@ -160,6 +184,7 @@ class Parser():
      index = url;

    log.info(f'Parsing page {url}')
+    log.debug(f'Using page config: {self.get_page_config(url)}')
    self.driver.get(url)

    # if ("This content does not exist" in self.driver.page_source):
@ -170,7 +195,7 @@ class Parser():
      # WebDriverWait(self.driver, 10).until(notion_page_loaded())
      WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'notion-presence-container')))
    except TimeoutException as ex:
-      log.error("Timeout waiting for page content to load, or no content found. Are you sure the page is set to public?")
+      log.critical("Timeout waiting for page content to load, or no content found. Are you sure the page is set to public?")
      return

    time.sleep(2)
@ -192,7 +217,7 @@ class Parser():
            try:
              WebDriverWait(self.driver, 10).until(toggle_block_has_opened(toggle_block))
            except TimeoutException as ex:
-              log.warn("Timeout waiting for toggle block to open")   
+              log.warning("Timeout waiting for toggle block to open")   
            opened_toggles.append(toggle_block) 
      # after all toggles have been opened, check the page again to see if
      # any toggle block had nested toggle blocks inside them
@ -204,7 +229,7 @@ class Parser():
    open_toggle_blocks()

    # creates soup from the page to start parsing
-    soup = BeautifulSoup(self.driver.page_source)
+    soup = BeautifulSoup(self.driver.page_source, "html.parser")

    # process eventual embedded iframes
    for embed in soup.select('div[embed-ghost]'):
@ -213,28 +238,22 @@ class Parser():
      iframe_parent['class'] = iframe_parent.get('class', []) + ['loconotion-iframe-target']
      iframe_parent['loconotion-iframe-src'] = iframe['src']

-    # process meta tags
-    def set_meta_tag(prop_name, prop_value, content):
-      log.debug(f"Setting meta tag {prop_value} to '{content}'")
-      tag = soup.find("meta", attrs = { prop_name : prop_value})
-      if (tag):
-        if (content): tag["content"] = content
-        else: tag.decompose();
-      else:
-        tag = soup.new_tag('meta')
-        tag.attrs[prop_name] = prop_value
-        tag.attrs['content'] = content
-        soup.head.append(tag)
-
    # clean up the default notion meta tags
    for tag in ["description", "twitter:card", "twitter:site", "twitter:title", "twitter:description", "twitter:image", "twitter:url", "apple-itunes-app"]:
-      set_meta_tag("name", tag, None)
+      unwanted_tag = soup.find("meta", attrs = { "name" : tag})
+      if (unwanted_tag): unwanted_tag.decompose();
    for tag in ["og:site_name", "og:type", "og:url", "og:title", "og:description", "og:image"]:
-      set_meta_tag("property", tag, None)
+      unwanted_og_tag = soup.find("meta", attrs = { "property" : tag})
+      if (unwanted_og_tag): unwanted_og_tag.decompose();

    # set custom meta tags
-    for name, content in self.config.get("meta", {}).items():
-      set_meta_tag("name", name, content)
+    custom_meta_tags = self.get_page_config(url).get("meta", [])
+    for custom_meta_tag in custom_meta_tags:
+      tag = soup.new_tag('meta')
+      for attr, value in custom_meta_tag.items():
+        tag.attrs[attr] = value
+      log.debug(f"Adding meta tag {str(tag)}")
+      soup.head.append(tag)

    # process images
    cache_images = True
@ -300,7 +319,7 @@ class Parser():
        toggle_content['class'] = toggle_content.get('class', []) + ['loconotion-toggle-content']
        toggle_content.attrs['loconotion-toggle-id'] = toggle_button.attrs['loconotion-toggle-id'] = toggle_id

-    # embed custom google font
+    # embed custom google font(s)
    fonts_selectors = {
      "site" : "div:not(.notion-code-block)",
      "navbar": ".notion-topbar div",
@ -312,7 +331,7 @@ class Parser():
      "code" : ".notion-code-block *"
    }

-    custom_fonts = self.config.get("fonts", {})
+    custom_fonts = self.get_page_config(url).get("fonts", {})
    if (custom_fonts):
      # append a stylesheet importing the google font for each unique font
      unique_custom_fonts = set(custom_fonts.values())
@ -348,12 +367,12 @@ class Parser():
      if a['href'].startswith('/'):
        sub_page_href = 'https://www.notion.so' + a['href']
        sub_pages.append(sub_page_href)
-        a['href'] = get_clean_slug(sub_page_href) if sub_page_href != index else "index.html"
+        a['href'] = self.get_page_slug(sub_page_href) if sub_page_href != index else "index.html"
        log.debug(f"Found link to page {a['href']}")

    # exports the parsed page
    html_str = str(soup)
-    html_file = get_clean_slug(url) if url != index else "index.html"
+    html_file = self.get_page_slug(url) if url != index else "index.html"
    log.info(f"Exporting page {url} as {html_file}")
    with open(self.dist_folder / html_file, "wb") as f:
      f.write(html_str.encode('utf-8').strip())
@ -385,9 +404,9 @@ if __name__ == '__main__':
          log.info("Initialising parser with simple page url")
          Parser({ "page" : args.target })
        else:
-          log.error(f"{args.target} is not a notion.so page")
+          log.critical(f"{args.target} is not a notion.so page")
      except requests.ConnectionError as exception:
-        log.error(f"{args.target} does not seem to be an existing web page")
+        log.critical(f"{args.target} does not seem to be an existing web page")
    else:
      if Path(args.target).is_file():
        with open(args.target) as f:
@ -395,9 +414,9 @@ if __name__ == '__main__':
          log.info("Initialising parser with configuration file")
          Parser(parsed_config)
      else:
-        log.error(f"Config file {args.target} does not exists")
+        log.critical(f"Config file {args.target} does not exists")
  except KeyboardInterrupt:
-    log.error('Interrupted by user')
+    log.critical('Interrupted by user')
    try:
      sys.exit(0)
    except SystemExit:
--- a/test_site.toml
+++ b/test_site.toml
@ -1,20 +1,31 @@
 name = "Notion Test Site"
 page = "https://www.notion.so/A-Notion-Page-03c403f4fdc94cc1b315b9469a8950ef"

-[meta]
-title = "Loconotion Test Site"
-description = "A static site generated from a Notion.so page using Loconotion"
+[site]
+  [[site.meta]]
+  name = "title"
+  content = "Loconotion Test Site"

-[fonts]
-site = 'Roboto'
-navbar = ''
-title = ''
-h1 = 'Nunito'
-h2 = 'Nunito'
-h3 = 'Nunito'
-body = ''
-code = ''
+  [[site.meta]]
+  name = "description"
+  content = "A static site generated from a Notion.so page using Loconotion"

-[custom]
-js = []
-css = []
+  [site.fonts]
+  site = 'Roboto'
+  navbar = ''
+  title = ''
+  h1 = 'Nunito'
+  h2 = 'Nunito'
+  h3 = 'Nunito'
+  body = ''
+  code = ''
+
+[d2fa06f244e64f66880bb0491f58223d] #list page
+  slug = "list"
+
+  [[d2fa06f244e64f66880bb0491f58223d.meta]]
+  name = "description"
+  content = "A fullscreen list database page"
+
+[d2fa06f244e64f66880bb] #list page
+  slug = "list"