Initial project commit

Working version with no config exposed
2024-08-30 18:12:12 +00:00 · 2020-05-16 18:24:40 +01:00
parent 7fd64af08a
commit b0a0baf9fb
5 changed files with 596 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,117 @@
+
+# Created by https://www.gitignore.io/api/python
+# Edit at https://www.gitignore.io/?templates=python
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# Mr Developer
+.mr.developer.cfg
+.project
+.pydevproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# End of https://www.gitignore.io/api/python
+
+.vscode
+env
+dist/*
+debug.log
+webdrive.log
+*.bat
--- a/bin/chromedriver.exe
+++ b/bin/chromedriver.exe
--- a/loconotion.css
+++ b/loconotion.css
@ -0,0 +1,62 @@
+/*
+ * Style overrides for pages exported by loconotion; the exporter links this
+ * sheet from the head of every page it writes.
+ */
+
+/* enables hover effect on buttons (anything with role="button", except
+   elements carrying the notion-record-icon class) */
+div[role="button"]:not(.notion-record-icon):hover {
+  background: rgba(55, 53, 47, 0.08);
+}
+
+/* hides loading spinner */
+.loading-spinner {
+  display: none !important;
+}
+
+/* hides elements on the top right (search box, duplicate, notion shortcut):
+   every child of the topbar's inner wrapper except the first one */
+.notion-topbar > div > div:not(:first-child) {
+  display: none !important;
+}
+
+/* everything below applies only to viewports up to 960px wide */
+@media only screen and (max-width: 960px) {
+  /* normalizes banner width */
+  .notion-scroller > div > div:not([class]) {
+    width: 100% !important;
+    max-width: 900px !important;
+    padding-left: 0 !important;
+    padding-right: 0 !important;
+  }
+
+  /* normalizes content width */
+  .notion-page-content {
+    width: 100% !important;
+    max-width: unset !important;
+    padding-right: 0 !important;
+    padding-left: 0 !important;
+  }
+
+  /* normalizes database views width (list, gallery, table, board, calendar) */
+  .notion-list-view,
+  .notion-gallery-view,
+  .notion-table-view,
+  .notion-board-view,
+  .notion-calendar-view {
+    padding-left: 0 !important;
+    padding-right: 0 !important;
+  }
+
+  /* add padding to banner, but not to image */
+  .notion-scroller > div:first-child > div:last-child {
+    padding-right: 2em !important;
+    padding-left: 2em !important;
+  }
+
+  /* add padding to content */
+  .notion-scroller > div:nth-child(2) {
+    padding-right: 2em !important;
+    padding-left: 2em !important;
+  }
+
+  /* collapses flex rows into columns: children are stacked vertically... */
+  .notion-column_list-block > div {
+    flex-direction: column;
+  }
+  /* ...and their explicit widths are dropped */
+  .notion-column_list-block > div > * {
+    width: unset !important;
+  }
+}
--- a/loconotion.js
+++ b/loconotion.js
@ -0,0 +1,61 @@
+const showToggle = (content, arrow) => {
+  arrow.style.transform = "rotateZ(180deg)";
+  content.style.display = "block";
+};
+
+const hideToggle = (content, arrow) => {
+  arrow.style.transform = "rotateZ(90deg)";
+  content.style.display = "none";
+};
+
+const toggleButtons = document.getElementsByClassName("loconotion-toggle-button");
+for (let i = 0; i < toggleButtons.length; i++) {
+  const toggleButton = toggleButtons.item(i);
+  const toggleId = toggleButton.getAttribute("loconotion-toggle-id");
+  const toggleContent = document.querySelector(`.loconotion-toggle-content[loconotion-toggle-id='${toggleId}']`);
+  const toggleArrow = toggleButton.querySelector("svg");
+  if (toggleButton && toggleContent) {
+    hideToggle(toggleContent, toggleArrow);
+    toggleButton.addEventListener("click", () => {
+      if (toggleContent.style.display == "none") {
+        showToggle(toggleContent, toggleArrow);
+      } else {
+        hideToggle(toggleContent, toggleArrow);
+      }
+    });
+  }
+}
+
+const pendingIframes = document.getElementsByClassName("loconotion-iframe-target");
+for (let i = 0; i < pendingIframes.length; i++) {
+  const pendingIframe = pendingIframes.item(i);
+  const iframeSrc = pendingIframe.getAttribute("loconotion-iframe-src");
+  const iframe = document.createElement("iframe");
+
+  pendingIframe.style.opacity = 0;
+  iframe.onload = () => {
+    pendingIframe.style.opacity = 1;
+  };
+
+  iframe.style.width = "100%";
+  iframe.style.height = "100%";
+  iframe.style.position = "absolute";
+  iframe.style.left = 0;
+  iframe.style.top = 0;
+  iframe.style.pointerEvents = "auto";
+
+  iframe.setAttribute("src", iframeSrc);
+  iframe.setAttribute("frameborder", "0");
+  iframe.setAttribute(
+    "sandbox",
+    "allow-scripts allow-popups allow-top-navigation-by-user-activation allow-forms allow-same-origin"
+  );
+
+  pendingIframe.appendChild(iframe);
+}
+
+const collectionSearchBoxes = document.getElementsByClassName("collectionSearch");
+for (let i = 0; i < collectionSearchBoxes.length; i++) {
+  const collectionSearchBox = collectionSearchBoxes.item(i).parentElement();
+  collectionSearchBox.style.display = "none";
+}
--- a/loconotion.py
+++ b/loconotion.py
@ -0,0 +1,356 @@
+import os
+import sys
+import requests
+import shutil
+import time
+import uuid
+import logging
+import re
+from rich.logging import RichHandler
+from rich.progress import Progress
+import urllib.parse
+import hashlib
+
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.common.exceptions import TimeoutException, NoSuchElementException
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait 
+
+from bs4 import BeautifulSoup
+from pathlib import Path
+import cssutils
+cssutils.log.setLevel(logging.CRITICAL) # removes warning logs from cssutils
+
+def setup_logger(name):
+  rich_handler = RichHandler()
+  logger = logging.getLogger(name)
+  logger.addHandler(rich_handler)
+  logger.setLevel(logging.DEBUG)
+  return logger
+
+log = setup_logger("loconotion-logger")
+
+def get_clean_slug(url, extension = True):
+  path = urllib.parse.urlparse(url).path.replace('/', '')
+  if ("-" in path and len(path.split("-")) > 1):
+    # a standard notion page looks like the-page-title-[uiid]
+    # strip the uuid and keep the page title only
+    path = "-".join(path.split("-")[:-1]).lower()
+  elif ("?" in path):
+    # database pages just have an uiid and a query param
+    # not much to do here, just get rid of the query param
+    path = path.split("?")[0].lower()
+  return path + (".html" if extension else "")
+
+def download_file(url, destination):
+  if not Path(destination).is_file():
+    # Disabling proxy speeds up requests time
+    # https://stackoverflow.com/questions/45783655/first-https-request-takes-much-more-time-than-the-rest
+    # https://stackoverflow.com/questions/28521535/requests-how-to-disable-bypass-proxy
+    session = requests.Session()
+    session.trust_env = False
+    parsed_url = urllib.parse.urlparse(url)
+    log.info(f"Downloading {parsed_url.scheme + parsed_url.netloc + parsed_url.path} to {destination}")
+    response = session.get(url)  
+    Path(destination).parent.mkdir(parents=True, exist_ok=True)
+    with open(destination, "wb") as f:
+      f.write(response.content)
+  else:
+    log.debug(f"File {destination} was already downloaded")
+  return destination
+
+# def rich_download_file(url, destination):
+#   if not Path(destination).is_file():
+#     progress = Progress(auto_refresh = True)
+#      # Disabling proxy speeds up requests time
+#     session = requests.Session()
+#     session.trust_env = False
+#     Path(destination).parent.mkdir(parents=True, exist_ok=True)
+#     with open(destination, 'wb') as f:
+#       response = session.get(url, stream=True)
+#       total = response.headers.get('content-length')
+#       task_id = progress.add_task(url)
+#       if total is None:
+#         f.write(response.content)
+#       else:
+#         progress.update(task_id, total=int(total))
+#         for data in response.iter_content(chunk_size=4096):
+#           f.write(data)
+#           progress.update(task_id, advance=len(data))
+#         progress.update(task_id, completed =int(total))
+#   else:
+#     log.debug(f"File {destination} was already downloaded")
+#   return destination
+
+class notion_page_loaded(object):
+  """An expectation for checking that a notion page has loaded.
+  """
+  def __call__(self, driver):
+    notion_presence = len(driver.find_elements_by_class_name("notion-presence-container"))
+    loading_spinners = len(driver.find_elements_by_class_name("loading-spinner"));
+    # embed_ghosts = len(driver.find_elements_by_css_selector("div[embed-ghost]"));
+    log.debug(f"Waiting for page content to load (presence container: {notion_presence}, loaders: {loading_spinners} )")
+    if (notion_presence and not loading_spinners):
+      return True
+    else:
+      return False
+
+
+class toggle_block_has_opened(object):
+  """An expectation for checking that a notion toggle block has been opened.
+  It does so by checking if the div hosting the content has enough children,
+  and the abscence of the loading spinner.
+  """
+  def __init__(self, toggle_block):
+    self.toggle_block = toggle_block
+
+  def __call__(self, driver):
+    toggle_content = self.toggle_block.find_element_by_css_selector("div:not([style]")
+    if (toggle_content):
+      content_children = len(toggle_content.find_elements_by_tag_name("div"))
+      is_loading = len(self.toggle_block.find_elements_by_class_name("loading-spinner"));
+      log.debug(f"Waiting for toggle block to load ({content_children} children so far and {is_loading} loaders)")
+      if (content_children > 3 and not is_loading):
+        return True
+      else:
+        return False
+    else:
+      return False
+
+class Parser():
+  def __init__(self, dist_folder):
+    self.dist_folder = Path(dist_folder)
+    self.driver = self.init_chromedriver()
+
+    # create output path if it doesn't exists
+    self.dist_folder.mkdir(parents=True, exist_ok=True)
+    log.info(f"Setting output path to {self.dist_folder}")
+
+  def init_chromedriver(self):
+    log.info("Initialising chrome driver")
+    chrome_options = Options()  
+    chrome_options.add_argument("--headless")  
+    chrome_options.add_argument("window-size=1920,1080")
+    chrome_options.add_argument("--log-level=3");
+    chrome_options.add_argument("--silent");
+    chrome_options.add_argument("--disable-logging")
+     # removes the 'DevTools listening' log message
+    chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
+    return webdriver.Chrome(
+      executable_path=str(Path.cwd() / "bin" / "chromedriver.exe"), 
+      service_log_path=str(Path.cwd() / "webdrive.log"),
+      options=chrome_options)
+
+  def parse_page(self, url, processed_pages, index = None):
+    # if this is the first page being parse, set it as the index.html
+    if (not index):
+      index = url;
+
+    log.info(f'Parsing page {url}')
+    self.driver.get(url)
+    try:
+      # WebDriverWait(self.driver, 10).until(notion_page_loaded())
+      WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'notion-presence-container')))
+    except TimeoutException as ex:
+      log.error("Timeout waiting for page content to load")
+      return
+
+    time.sleep(2)
+
+    # expands all the toggle block in the page to make their content visible
+    # we hook up our custom toggle logic afterwards
+    def open_toggle_blocks(exclude = []):
+      opened_toggles = exclude;
+      toggle_blocks = self.driver.find_elements_by_class_name("notion-toggle-block")
+      log.debug(f"Opening {len(toggle_blocks)} new toggle blocks in the page")
+      for toggle_block in toggle_blocks:
+        if (not toggle_block in opened_toggles):
+          toggle_button = toggle_block.find_element_by_css_selector("div[role=button]")
+          # check if the toggle is already open by the direction of its arrow
+          is_toggled = "(180deg)" in (toggle_button.find_element_by_tag_name("svg").get_attribute("style"))
+          if (not is_toggled):
+            # click on it, then wait until all elements are displayed
+            toggle_button.click()
+            try:
+              WebDriverWait(self.driver, 10).until(toggle_block_has_opened(toggle_block))
+            except TimeoutException as ex:
+              log.warn("Timeout waiting for toggle block to open")   
+            opened_toggles.append(toggle_block) 
+      # after all toggles have been opened, check the page again to see if
+      # any toggle block had nested toggle blocks inside them
+      new_toggle_blocks = self.driver.find_elements_by_class_name("notion-toggle-block")
+      if (len(new_toggle_blocks) > len(toggle_blocks)):
+        # if so, run the function again
+        open_toggle_blocks(opened_toggles)
+
+    open_toggle_blocks()
+
+    # creates soup from the page to start parsing
+    soup = BeautifulSoup(self.driver.page_source, "lxml")
+
+    # process eventual embedded iframes
+    for embed in soup.select('div[embed-ghost]'):
+      iframe = embed.find('iframe');
+      iframe_parent = iframe.parent
+      iframe_parent['class'] = iframe_parent.get('class', []) + ['loconotion-iframe-target']
+      iframe_parent['loconotion-iframe-src'] = iframe['src']
+
+    # process meta tags
+    def set_meta_tag(prop_name, prop_value, content):
+      tag = soup.find("meta", attrs = { prop_name : prop_value})
+      if (tag):
+        log.debug(f"Setting meta tag {prop_value} to {content}")
+        if (content): tag["content"] = content
+        else: tag.decompose();
+      else:
+        log.warn(f"Meta tag with {prop_name}: {prop_value} was not found")
+
+    set_meta_tag("name", "description", None)
+    set_meta_tag("name", "twitter:card", None)
+    set_meta_tag("name", "twitter:site", None)
+    set_meta_tag("name", "twitter:title", None)
+    set_meta_tag("name", "twitter:description", None)
+    set_meta_tag("name", "twitter:image", None)
+    set_meta_tag("name", "twitter:url", None)
+    set_meta_tag("property", "og:site_name", None)
+    set_meta_tag("property", "og:type", None)
+    set_meta_tag("property", "og:url", None)
+    set_meta_tag("property", "og:title", None)
+    set_meta_tag("property", "og:description", None)
+    set_meta_tag("property", "og:image", None)
+    set_meta_tag("name", "apple-itunes-app", None)
+
+    # process images
+    cache_images = True
+    for img in soup.findAll('img'):
+      if img.has_attr('src'):
+        if (cache_images):
+          img_src = img['src']
+
+          # if the path starts with /, it's one of notion's predefined images
+          if (img['src'].startswith('/')):
+            # notion's images urls are in a weird format, need to sanitize them
+            img_src = 'https://www.notion.so' + img['src'].split("notion.so")[-1].replace("notion.so", "").split("?")[0]
+
+          # generate an hashed id for the image filename based the url,
+          # so we avoid re-downloading images we have already downloaded,
+          # and figure out the filename from the url (I know, just this once)
+          img_extension = Path(urllib.parse.urlparse(img_src).path).suffix
+          img_name = hashlib.sha1(str.encode(img_src)).hexdigest();
+          img_file = img_name + img_extension
+
+          download_file(img_src, self.dist_folder / img_file)
+          img['src'] = img_file
+        else:
+          if (img['src'].startswith('/')):
+            img['src'] = "https://www.notion.so" + img['src']
+
+    # process stylesheets
+    for link in soup.findAll('link', rel="stylesheet"):
+      if link.has_attr('href') and link['href'].startswith('/'):
+        # we don't need the vendors stylesheet
+        if ("vendors~" in link['href']):
+          continue
+        css_file = link['href'].replace('/', '')
+        saved_css_file = download_file('https://www.notion.so' + link['href'], self.dist_folder / css_file)
+        with open(saved_css_file, 'rb') as f:
+          stylesheet = cssutils.parseString(f.read())
+          # open the stylesheet and check for any font-face rule,
+          for rule in stylesheet.cssRules:
+            if rule.type == cssutils.css.CSSRule.FONT_FACE_RULE:
+              # if any are found, download the font file
+              font_file = rule.style['src'].split("url(/")[-1].split(") format")[0]
+              download_file(f'https://www.notion.so/{font_file}', self.dist_folder / font_file)
+        link['href'] = css_file
+
+    # remove scripts and other tags we don't want / need
+    for unwanted in soup.findAll(['script', 'iframe']):
+      unwanted.decompose();
+    for intercom_div in soup.findAll('div',{'class':'intercom-lightweight-app'}):
+      intercom_div.decompose();
+    for overlay_div in soup.findAll('div',{'class':'notion-overlay-container'}):
+      overlay_div.decompose();
+
+    # add our custom logic to all toggle blocks
+    for toggle_block in soup.findAll('div',{'class':'notion-toggle-block'}):
+      toggle_id = uuid.uuid4() 
+      toggle_button = toggle_block.select_one('div[role=button]')
+      toggle_content = toggle_block.find('div', {'class': None, 'style': ''})
+      if (toggle_button and toggle_content):
+        # add a custom class to the toggle button and content, plus a custom attribute
+        # sharing a unique uiid so we can hook them up with some custom js logic later
+        toggle_button['class'] = toggle_block.get('class', []) + ['loconotion-toggle-button']
+        toggle_content['class'] = toggle_content.get('class', []) + ['loconotion-toggle-content']
+        toggle_content.attrs['loconotion-toggle-id'] = toggle_button.attrs['loconotion-toggle-id'] = toggle_id
+
+    # embed custom google font
+    custom_font = None
+    if (custom_font):
+      custom_font_stylesheet_stylesheet = soup.new_tag("link")
+      custom_font_stylesheet.attrs["rel"] = "stylesheet"
+      custom_font_stylesheet.attrs["href"] = f"https://fonts.googleapis.com/css2?family={custom_font}:wght@500;600;700&display=swap"
+      soup.head.insert(-1, custom_font_stylesheet)
+      for app in soup.findAll('div',{'class':'notion-app-inner'}):
+        style = cssutils.parseStyle(app['style']);
+        style['font-family'] = f"'{custom_font}', {style['font-family']}"
+        app['style'] = style.cssText
+
+    # append custom stylesheet
+    custom_css = soup.new_tag("link")
+    custom_css.attrs["rel"] = "stylesheet"
+    custom_css.attrs["href"] = "loconotion.css"
+    soup.head.insert(-1, custom_css)
+
+    # append custom script
+    custom_script = soup.new_tag("script")
+    custom_script.attrs["type"] = "text/javascript"
+    custom_script.attrs["src"] = "loconotion.js"
+    soup.body.insert(-1, custom_script)
+
+    # find sub-pages and clean slugs / links
+    sub_pages = [];
+    for a in soup.findAll('a'):
+      if a['href'].startswith('/'):
+        sub_page_href = 'https://www.notion.so' + a['href']
+        sub_pages.append(sub_page_href)
+        a['href'] = get_clean_slug(sub_page_href) if sub_page_href != index else "index.html"
+        log.debug(f"Found link to page {a['href']}")
+
+    # exports the parsed page
+    html_str = str(soup)
+    html_file = get_clean_slug(url) if url != index else "index.html"
+    log.info(f"Exporting page {url} as {html_file}")
+    with open(self.dist_folder / html_file, "wb") as f:
+      f.write(html_str.encode('utf-8').strip())
+    processed_pages.append(url)
+
+    # parse sub-pages
+    for sub_page in sub_pages:
+      if not sub_page in processed_pages:
+        self.parse_page(sub_page, processed_pages, index)
+
+  def run(self, url):
+    processed_pages = []
+    self.parse_page(url, processed_pages)
+
+    # copy custom assets to dist folder
+    shutil.copyfile("loconotion.css", self.dist_folder / "loconotion.css");
+    shutil.copyfile("loconotion.js", self.dist_folder / "loconotion.js");
+
+# Script entry point: exports the notion page hard-coded below into
+# dist/<slug of the page>, relative to the current working directory.
+if __name__ == '__main__':
+  try:
+    url = "https://www.notion.so/leoncvlt-f276385bf5ce42969497f0b03aef907e"
+    output_folder = Path("dist") / get_clean_slug(url, extension = False)
+    parser = Parser(output_folder)
+    parser.run(url)
+    # parser.run("https://www.notion.so/A-Notion-Page-03c403f4fdc94cc1b315b9469a8950ef")
+    # parser.run("https://www.notion.so/Media-be1a5c3e1c9640a0ab9ba0ba9b67e6a5")
+    # parser.run('https://www.notion.so/leoncvlt-f276385bf5ce42969497f0b03aef907e')
+  except KeyboardInterrupt:
+    log.error('Interrupted by user')
+    # sys.exit raises SystemExit, which the handler right below catches, so an
+    # interrupt always ends in os._exit(0): the process exits immediately,
+    # without the interpreter's regular cleanup
+    try:
+      sys.exit(0)
+    except SystemExit:
+      os._exit(0)