Initial project commit

Working version with no config exposed
This commit is contained in:
Leonardo Cavaletti 2020-05-16 18:24:40 +01:00
parent 7fd64af08a
commit b0a0baf9fb
5 changed files with 596 additions and 0 deletions

117
.gitignore vendored Normal file
View File

@ -0,0 +1,117 @@
# Created by https://www.gitignore.io/api/python
# Edit at https://www.gitignore.io/?templates=python
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# Mr Developer
.mr.developer.cfg
.project
.pydevproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# End of https://www.gitignore.io/api/python
.vscode
env
dist/*
debug.log
webdrive.log
*.bat

BIN
bin/chromedriver.exe Normal file

Binary file not shown.

62
loconotion.css Normal file
View File

@ -0,0 +1,62 @@
/* Give clickable blocks a hover highlight (page icons are left alone). */
div[role="button"]:not(.notion-record-icon):hover {
  background: rgba(55, 53, 47, 0.08);
}

/* Loading spinners are always hidden. */
.loading-spinner {
  display: none !important;
}

/* Keep only the first child of the top bar; the remaining elements on the
   top right (search box, duplicate, notion shortcut) are hidden. */
.notion-topbar > div > div:not(:first-child) {
  display: none !important;
}
/* Layout overrides for viewports up to 960px wide. */
@media only screen and (max-width: 960px) {
  /* banner: full width (capped at 900px), no side padding */
  .notion-scroller > div > div:not([class]) {
    width: 100% !important;
    max-width: 900px !important;
    padding-right: 0 !important;
    padding-left: 0 !important;
  }

  /* page content: full width, no cap, no side padding */
  .notion-page-content {
    width: 100% !important;
    max-width: unset !important;
    padding-left: 0 !important;
    padding-right: 0 !important;
  }

  /* database views: no side padding */
  .notion-list-view,
  .notion-gallery-view,
  .notion-table-view,
  .notion-board-view,
  .notion-calendar-view {
    padding-right: 0 !important;
    padding-left: 0 !important;
  }

  /* side padding for the banner (but not its image) and for the content */
  .notion-scroller > div:first-child > div:last-child,
  .notion-scroller > div:nth-child(2) {
    padding-left: 2em !important;
    padding-right: 2em !important;
  }

  /* column lists: stack the flex row vertically and drop fixed widths */
  .notion-column_list-block > div {
    flex-direction: column;
  }

  .notion-column_list-block > div > * {
    width: unset !important;
  }
}

61
loconotion.js Normal file
View File

@ -0,0 +1,61 @@
/**
 * Expands a toggle: points the arrow down (180deg) and displays the content.
 * @param {HTMLElement} content element holding the toggle's content
 * @param {Element} arrow the toggle's arrow element
 */
const showToggle = (content, arrow) => {
  const arrowStyle = arrow.style;
  arrowStyle.transform = "rotateZ(180deg)";
  const contentStyle = content.style;
  contentStyle.display = "block";
};
/**
 * Collapses a toggle: points the arrow sideways (90deg) and hides the content.
 * @param {HTMLElement} content element holding the toggle's content
 * @param {Element} arrow the toggle's arrow element
 */
const hideToggle = (content, arrow) => {
  const arrowStyle = arrow.style;
  arrowStyle.transform = "rotateZ(90deg)";
  const contentStyle = content.style;
  contentStyle.display = "none";
};
// Wire up every toggle button: each button is paired with the content element
// carrying the same "loconotion-toggle-id" attribute, starts collapsed, and
// flips between shown and hidden on click.
const toggleButtons = document.getElementsByClassName("loconotion-toggle-button");
for (let i = 0; i < toggleButtons.length; i++) {
  const toggleButton = toggleButtons.item(i);
  const toggleId = toggleButton.getAttribute("loconotion-toggle-id");
  const toggleContent = document.querySelector(
    `.loconotion-toggle-content[loconotion-toggle-id='${toggleId}']`
  );
  const toggleArrow = toggleButton.querySelector("svg");
  // Both lookups can return null. Skip incomplete toggles instead of letting
  // hideToggle/showToggle throw on `null.style`, which would abort the rest
  // of this script (including the iframe injection below).
  if (!toggleContent || !toggleArrow) {
    continue;
  }
  hideToggle(toggleContent, toggleArrow);
  toggleButton.addEventListener("click", () => {
    if (toggleContent.style.display === "none") {
      showToggle(toggleContent, toggleArrow);
    } else {
      hideToggle(toggleContent, toggleArrow);
    }
  });
}
// Re-create the embeds: every ".loconotion-iframe-target" element gets a fresh
// iframe pointing at its "loconotion-iframe-src" attribute. The target stays
// transparent until the iframe has fired its load event.
const pendingIframes = document.getElementsByClassName("loconotion-iframe-target");
for (const target of Array.from(pendingIframes)) {
  const source = target.getAttribute("loconotion-iframe-src");
  const frame = document.createElement("iframe");

  target.style.opacity = 0;
  frame.onload = () => {
    target.style.opacity = 1;
  };

  // stretch the iframe over its whole container
  Object.assign(frame.style, {
    width: "100%",
    height: "100%",
    position: "absolute",
    left: 0,
    top: 0,
    pointerEvents: "auto",
  });

  const attributes = [
    ["src", source],
    ["frameborder", "0"],
    [
      "sandbox",
      "allow-scripts allow-popups allow-top-navigation-by-user-activation allow-forms allow-same-origin",
    ],
  ];
  attributes.forEach(([name, value]) => frame.setAttribute(name, value));

  target.appendChild(frame);
}
// Hide the container of every collection search box.
const collectionSearchBoxes = document.getElementsByClassName("collectionSearch");
for (let i = 0; i < collectionSearchBoxes.length; i++) {
  // parentElement is a property, not a method: calling it throws a TypeError.
  const collectionSearchBox = collectionSearchBoxes.item(i).parentElement;
  // parentElement is null for an element without a parent element.
  if (collectionSearchBox) {
    collectionSearchBox.style.display = "none";
  }
}

356
loconotion.py Normal file
View File

@ -0,0 +1,356 @@
import os
import sys
import requests
import shutil
import time
import uuid
import logging
import re
from rich.logging import RichHandler
from rich.progress import Progress
import urllib.parse
import hashlib
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
from pathlib import Path
import cssutils
cssutils.log.setLevel(logging.CRITICAL) # removes warning logs from cssutils
def setup_logger(name):
    """Return the logger called *name*, set to DEBUG, with a RichHandler attached.

    A new handler is added on every call.
    """
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)
    logger.addHandler(RichHandler())
    return logger


# module-wide logger used by the functions and classes in this file
log = setup_logger("loconotion-logger")
def get_clean_slug(url, extension=True):
    """Derive an output file name (slug) from a page url.

    Only the path component of *url* is used, with every "/" removed; the
    query string and fragment are already dropped by ``urlparse``.

    * A path containing dashes (a standard notion page looks like
      ``the-page-title-[uuid]``) loses everything from its last dash onwards
      and is lower-cased, leaving just the page title.
    * A path without dashes (e.g. a database page, which is just a uuid plus
      a query parameter) is returned unchanged.

    Args:
        url: The page url.
        extension: When true, ".html" is appended to the slug.

    Returns:
        The slug as a string.
    """
    path = urllib.parse.urlparse(url).path.replace("/", "")
    if "-" in path:
        # keep what precedes the last dash, i.e. strip the trailing uuid
        path = path.rpartition("-")[0].lower()
    return path + (".html" if extension else "")
def download_file(url, destination):
    """Download *url* to *destination* unless that file already exists.

    Parent folders of *destination* are created as needed. The response body
    is written as-is; a non-successful HTTP status is logged as a warning but
    the body is still saved.

    Args:
        url: Address of the file to fetch.
        destination: Path (str or Path) the file is saved to.

    Returns:
        *destination*, unchanged.
    """
    if Path(destination).is_file():
        log.debug(f"File {destination} was already downloaded")
        return destination

    # log the url without its params / query string / fragment
    parsed_url = urllib.parse.urlparse(url)
    display_url = parsed_url._replace(params="", query="", fragment="").geturl()
    log.info(f"Downloading {display_url} to {destination}")

    # Disabling proxy speeds up requests time
    # https://stackoverflow.com/questions/45783655/first-https-request-takes-much-more-time-than-the-rest
    # https://stackoverflow.com/questions/28521535/requests-how-to-disable-bypass-proxy
    # The session is used as a context manager so its connections get closed.
    with requests.Session() as session:
        session.trust_env = False
        response = session.get(url)

    if not response.ok:
        log.warning(f"Request for {display_url} returned status {response.status_code}")

    Path(destination).parent.mkdir(parents=True, exist_ok=True)
    with open(destination, "wb") as f:
        f.write(response.content)
    return destination
# def rich_download_file(url, destination):
# if not Path(destination).is_file():
# progress = Progress(auto_refresh = True)
# # Disabling proxy speeds up requests time
# session = requests.Session()
# session.trust_env = False
# Path(destination).parent.mkdir(parents=True, exist_ok=True)
# with open(destination, 'wb') as f:
# response = session.get(url, stream=True)
# total = response.headers.get('content-length')
# task_id = progress.add_task(url)
# if total is None:
# f.write(response.content)
# else:
# progress.update(task_id, total=int(total))
# for data in response.iter_content(chunk_size=4096):
# f.write(data)
# progress.update(task_id, advance=len(data))
# progress.update(task_id, completed =int(total))
# else:
# log.debug(f"File {destination} was already downloaded")
# return destination
class notion_page_loaded(object):
    """Wait condition that is true once a notion page has finished loading.

    The page counts as loaded when it contains at least one element with the
    "notion-presence-container" class and no element with the
    "loading-spinner" class.
    """

    def __call__(self, driver):
        presence_count = len(driver.find_elements_by_class_name("notion-presence-container"))
        spinner_count = len(driver.find_elements_by_class_name("loading-spinner"))
        log.debug(f"Waiting for page content to load (presence container: {presence_count}, loaders: {spinner_count} )")
        return bool(presence_count and not spinner_count)
class toggle_block_has_opened(object):
    """Wait condition that is true once a notion toggle block has been opened.

    The block counts as opened when the first descendant div without a style
    attribute contains more than three div elements, and the block contains
    no element with the "loading-spinner" class.
    """

    def __init__(self, toggle_block):
        # the toggle block element this condition inspects
        self.toggle_block = toggle_block

    def __call__(self, driver):
        # the selector previously lacked the closing parenthesis of :not(...)
        toggle_content = self.toggle_block.find_element_by_css_selector("div:not([style])")
        if not toggle_content:
            return False
        content_children = len(toggle_content.find_elements_by_tag_name("div"))
        is_loading = len(self.toggle_block.find_elements_by_class_name("loading-spinner"))
        log.debug(f"Waiting for toggle block to load ({content_children} children so far and {is_loading} loaders)")
        return content_children > 3 and not is_loading
class Parser():
    """Exports a notion.so page, and the pages it links to, as static html.

    Pages are rendered with a headless chrome driver, cleaned up with
    BeautifulSoup and written to ``dist_folder`` together with the images,
    stylesheets and fonts they reference.
    """

    def __init__(self, dist_folder):
        """Start the chrome driver and create *dist_folder* if needed."""
        self.dist_folder = Path(dist_folder)
        self.driver = self.init_chromedriver()

        # create output path if it doesn't exist
        self.dist_folder.mkdir(parents=True, exist_ok=True)
        log.info(f"Setting output path to {self.dist_folder}")

    def init_chromedriver(self):
        """Return a headless Chrome webdriver with its logging silenced.

        The driver binary is looked up at ``bin/chromedriver.exe`` and the
        service log is written to ``webdrive.log``, both relative to the
        current working directory.
        """
        log.info("Initialising chrome driver")
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("window-size=1920,1080")
        chrome_options.add_argument("--log-level=3")
        chrome_options.add_argument("--silent")
        chrome_options.add_argument("--disable-logging")
        # removes the 'DevTools listening' log message
        chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
        return webdriver.Chrome(
            executable_path=str(Path.cwd() / "bin" / "chromedriver.exe"),
            service_log_path=str(Path.cwd() / "webdrive.log"),
            options=chrome_options)

    def parse_page(self, url, processed_pages, index = None):
        """Export the page at *url*, then recurse into the pages it links to.

        Args:
            url: Address of the page to export.
            processed_pages: List of urls already exported. The url is
                appended once its file is written, and the list is consulted
                before recursing into a sub-page.
            index: Url of the page exported as ``index.html``. Defaults to
                *url*, i.e. the first page parsed.

        Returns:
            None. A page whose content does not show up within 10 seconds is
            logged and skipped.
        """
        # if this is the first page being parsed, set it as the index.html
        if not index:
            index = url

        log.info(f'Parsing page {url}')
        self.driver.get(url)
        try:
            WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'notion-presence-container')))
        except TimeoutException:
            log.error("Timeout waiting for page content to load")
            return
        time.sleep(2)

        # expand all the toggle blocks in the page to make their content
        # visible; our custom toggle logic is hooked up afterwards
        self._open_toggle_blocks()

        # creates soup from the page to start parsing
        soup = BeautifulSoup(self.driver.page_source, "lxml")
        self._process_embeds(soup)
        self._strip_meta_tags(soup)
        self._process_images(soup)
        self._process_stylesheets(soup)
        self._remove_unwanted_tags(soup)
        self._hook_toggle_blocks(soup)
        self._embed_custom_font(soup)
        self._inject_custom_assets(soup)
        sub_pages = self._rewrite_links(soup, index)

        # exports the parsed page
        html_str = str(soup)
        html_file = get_clean_slug(url) if url != index else "index.html"
        log.info(f"Exporting page {url} as {html_file}")
        with open(self.dist_folder / html_file, "wb") as f:
            f.write(html_str.encode('utf-8').strip())
        processed_pages.append(url)

        # parse sub-pages
        for sub_page in sub_pages:
            if sub_page not in processed_pages:
                self.parse_page(sub_page, processed_pages, index)

    def _open_toggle_blocks(self, opened_toggles=None):
        """Click open every closed toggle block, including nested ones."""
        if opened_toggles is None:
            opened_toggles = []
        toggle_blocks = self.driver.find_elements_by_class_name("notion-toggle-block")
        log.debug(f"Opening {len(toggle_blocks)} new toggle blocks in the page")
        for toggle_block in toggle_blocks:
            if toggle_block in opened_toggles:
                continue
            toggle_button = toggle_block.find_element_by_css_selector("div[role=button]")
            # check if the toggle is already open by the direction of its arrow
            is_toggled = "(180deg)" in (toggle_button.find_element_by_tag_name("svg").get_attribute("style"))
            if not is_toggled:
                # click on it, then wait until all elements are displayed
                toggle_button.click()
                try:
                    WebDriverWait(self.driver, 10).until(toggle_block_has_opened(toggle_block))
                except TimeoutException:
                    log.warning("Timeout waiting for toggle block to open")
            opened_toggles.append(toggle_block)

        # after all toggles have been opened, check the page again to see if
        # any toggle block had nested toggle blocks inside them
        new_toggle_blocks = self.driver.find_elements_by_class_name("notion-toggle-block")
        if len(new_toggle_blocks) > len(toggle_blocks):
            # if so, run the function again
            self._open_toggle_blocks(opened_toggles)

    def _process_embeds(self, soup):
        """Tag the parent of each embedded iframe so loconotion.js can restore it."""
        for embed in soup.select('div[embed-ghost]'):
            iframe = embed.find('iframe')
            # an embed without an iframe (or without a src) has nothing to restore
            if iframe is None or not iframe.has_attr('src'):
                continue
            iframe_parent = iframe.parent
            iframe_parent['class'] = iframe_parent.get('class', []) + ['loconotion-iframe-target']
            iframe_parent['loconotion-iframe-src'] = iframe['src']

    @staticmethod
    def _set_meta_tag(soup, prop_name, prop_value, content):
        """Set the content of a meta tag, or remove the tag when *content* is falsy."""
        tag = soup.find("meta", attrs = { prop_name : prop_value})
        if tag:
            log.debug(f"Setting meta tag {prop_value} to {content}")
            if content:
                tag["content"] = content
            else:
                tag.decompose()
        else:
            log.warning(f"Meta tag with {prop_name}: {prop_value} was not found")

    def _strip_meta_tags(self, soup):
        """Remove the description, twitter, open graph and itunes meta tags."""
        meta_tags = (
            ("name", "description"),
            ("name", "twitter:card"),
            ("name", "twitter:site"),
            ("name", "twitter:title"),
            ("name", "twitter:description"),
            ("name", "twitter:image"),
            ("name", "twitter:url"),
            ("property", "og:site_name"),
            ("property", "og:type"),
            ("property", "og:url"),
            ("property", "og:title"),
            ("property", "og:description"),
            ("property", "og:image"),
            ("name", "apple-itunes-app"),
        )
        for prop_name, prop_value in meta_tags:
            self._set_meta_tag(soup, prop_name, prop_value, None)

    def _process_images(self, soup, cache_images=True):
        """Download the page's images and point the img tags to the local copies.

        With *cache_images* false nothing is downloaded and root-relative
        sources are only made absolute.
        """
        for img in soup.findAll('img'):
            if not img.has_attr('src'):
                continue
            if cache_images:
                img_src = img['src']
                # inline data: images cannot (and need not) be downloaded
                if img_src.startswith('data:'):
                    continue
                # if the path starts with /, it's one of notion's predefined images
                if img_src.startswith('/'):
                    # notion's images urls are in a weird format, need to sanitize them
                    img_src = 'https://www.notion.so' + img_src.split("notion.so")[-1].split("?")[0]
                # generate a hashed id for the image filename based on the url,
                # so we avoid re-downloading images we have already downloaded,
                # and figure out the file extension from the url
                img_extension = Path(urllib.parse.urlparse(img_src).path).suffix
                img_name = hashlib.sha1(str.encode(img_src)).hexdigest()
                img_file = img_name + img_extension
                download_file(img_src, self.dist_folder / img_file)
                img['src'] = img_file
            elif img['src'].startswith('/'):
                img['src'] = "https://www.notion.so" + img['src']

    def _process_stylesheets(self, soup):
        """Download notion's stylesheets, and the fonts they declare, locally."""
        for link in soup.findAll('link', rel="stylesheet"):
            if not (link.has_attr('href') and link['href'].startswith('/')):
                continue
            # we don't need the vendors stylesheet
            if "vendors~" in link['href']:
                continue
            css_file = link['href'].replace('/', '')
            saved_css_file = download_file('https://www.notion.so' + link['href'], self.dist_folder / css_file)
            with open(saved_css_file, 'rb') as f:
                stylesheet = cssutils.parseString(f.read())
            # check the stylesheet for any font-face rule
            for rule in stylesheet.cssRules:
                if rule.type == cssutils.css.CSSRule.FONT_FACE_RULE:
                    # if any are found, download the font file
                    font_file = rule.style['src'].split("url(/")[-1].split(") format")[0]
                    download_file(f'https://www.notion.so/{font_file}', self.dist_folder / font_file)
            link['href'] = css_file

    def _remove_unwanted_tags(self, soup):
        """Remove scripts, iframes and the intercom / overlay containers."""
        for unwanted in soup.findAll(['script', 'iframe']):
            unwanted.decompose()
        for intercom_div in soup.findAll('div',{'class':'intercom-lightweight-app'}):
            intercom_div.decompose()
        for overlay_div in soup.findAll('div',{'class':'notion-overlay-container'}):
            overlay_div.decompose()

    def _hook_toggle_blocks(self, soup):
        """Mark toggle buttons and their content so loconotion.js can pair them."""
        for toggle_block in soup.findAll('div',{'class':'notion-toggle-block'}):
            toggle_id = str(uuid.uuid4())
            toggle_button = toggle_block.select_one('div[role=button]')
            toggle_content = toggle_block.find('div', {'class': None, 'style': ''})
            if toggle_button and toggle_content:
                # add a custom class to the toggle button and content, plus a custom attribute
                # sharing a unique uuid so we can hook them up with some custom js logic later.
                # The button keeps its own classes (not the ones of the enclosing toggle block).
                toggle_button['class'] = toggle_button.get('class', []) + ['loconotion-toggle-button']
                toggle_content['class'] = toggle_content.get('class', []) + ['loconotion-toggle-content']
                toggle_content.attrs['loconotion-toggle-id'] = toggle_button.attrs['loconotion-toggle-id'] = toggle_id

    def _embed_custom_font(self, soup, custom_font=None):
        """Link a google font and put it first in the app's font-family.

        Does nothing while *custom_font* is None, which is the only value
        used so far.
        """
        if not custom_font:
            return
        custom_font_stylesheet = soup.new_tag("link")
        custom_font_stylesheet.attrs["rel"] = "stylesheet"
        custom_font_stylesheet.attrs["href"] = f"https://fonts.googleapis.com/css2?family={custom_font}:wght@500;600;700&display=swap"
        soup.head.insert(-1, custom_font_stylesheet)
        for app in soup.findAll('div',{'class':'notion-app-inner'}):
            style = cssutils.parseStyle(app['style'])
            style['font-family'] = f"'{custom_font}', {style['font-family']}"
            app['style'] = style.cssText

    def _inject_custom_assets(self, soup):
        """Reference loconotion.css and loconotion.js from the page."""
        # custom stylesheet
        custom_css = soup.new_tag("link")
        custom_css.attrs["rel"] = "stylesheet"
        custom_css.attrs["href"] = "loconotion.css"
        soup.head.insert(-1, custom_css)
        # custom script
        custom_script = soup.new_tag("script")
        custom_script.attrs["type"] = "text/javascript"
        custom_script.attrs["src"] = "loconotion.js"
        soup.body.insert(-1, custom_script)

    def _rewrite_links(self, soup, index):
        """Point root-relative links to the exported files and return their notion urls."""
        sub_pages = []
        # href=True skips anchors without an href attribute
        for a in soup.findAll('a', href=True):
            if a['href'].startswith('/'):
                sub_page_href = 'https://www.notion.so' + a['href']
                sub_pages.append(sub_page_href)
                a['href'] = get_clean_slug(sub_page_href) if sub_page_href != index else "index.html"
                log.debug(f"Found link to page {a['href']}")
        return sub_pages

    def run(self, url):
        """Export *url* and its sub-pages, then copy the custom assets.

        loconotion.css and loconotion.js are read from the current working
        directory.
        """
        processed_pages = []
        self.parse_page(url, processed_pages)

        # copy custom assets to dist folder
        shutil.copyfile("loconotion.css", self.dist_folder / "loconotion.css")
        shutil.copyfile("loconotion.js", self.dist_folder / "loconotion.js")
if __name__ == '__main__':
    try:
        # Page to export; the output goes to dist/<slug of the page>.
        # Other pages that have been used here:
        #   https://www.notion.so/A-Notion-Page-03c403f4fdc94cc1b315b9469a8950ef
        #   https://www.notion.so/Media-be1a5c3e1c9640a0ab9ba0ba9b67e6a5
        url = "https://www.notion.so/leoncvlt-f276385bf5ce42969497f0b03aef907e"
        Parser(Path("dist") / get_clean_slug(url, extension=False)).run(url)
    except KeyboardInterrupt:
        log.error('Interrupted by user')
        # try a regular exit first; fall back to a hard exit
        try:
            sys.exit(0)
        except SystemExit:
            os._exit(0)