mirror of
https://github.com/leoncvlt/loconotion.git
synced 2024-08-30 18:12:12 +00:00
356 lines
15 KiB
Python
356 lines
15 KiB
Python
|
import os
|
||
|
import sys
|
||
|
import requests
|
||
|
import shutil
|
||
|
import time
|
||
|
import uuid
|
||
|
import logging
|
||
|
import re
|
||
|
from rich.logging import RichHandler
|
||
|
from rich.progress import Progress
|
||
|
import urllib.parse
|
||
|
import hashlib
|
||
|
|
||
|
from selenium import webdriver
|
||
|
from selenium.webdriver.chrome.options import Options
|
||
|
from selenium.common.exceptions import TimeoutException, NoSuchElementException
|
||
|
from selenium.webdriver.support import expected_conditions as EC
|
||
|
from selenium.webdriver.common.by import By
|
||
|
from selenium.webdriver.support.ui import WebDriverWait
|
||
|
|
||
|
from bs4 import BeautifulSoup
|
||
|
from pathlib import Path
|
||
|
import cssutils
|
||
|
cssutils.log.setLevel(logging.CRITICAL) # removes warning logs from cssutils
|
||
|
|
||
|
def setup_logger(name):
|
||
|
rich_handler = RichHandler()
|
||
|
logger = logging.getLogger(name)
|
||
|
logger.addHandler(rich_handler)
|
||
|
logger.setLevel(logging.DEBUG)
|
||
|
return logger
|
||
|
|
||
|
log = setup_logger("loconotion-logger")
|
||
|
|
||
|
def get_clean_slug(url, extension = True):
|
||
|
path = urllib.parse.urlparse(url).path.replace('/', '')
|
||
|
if ("-" in path and len(path.split("-")) > 1):
|
||
|
# a standard notion page looks like the-page-title-[uiid]
|
||
|
# strip the uuid and keep the page title only
|
||
|
path = "-".join(path.split("-")[:-1]).lower()
|
||
|
elif ("?" in path):
|
||
|
# database pages just have an uiid and a query param
|
||
|
# not much to do here, just get rid of the query param
|
||
|
path = path.split("?")[0].lower()
|
||
|
return path + (".html" if extension else "")
|
||
|
|
||
|
def download_file(url, destination):
|
||
|
if not Path(destination).is_file():
|
||
|
# Disabling proxy speeds up requests time
|
||
|
# https://stackoverflow.com/questions/45783655/first-https-request-takes-much-more-time-than-the-rest
|
||
|
# https://stackoverflow.com/questions/28521535/requests-how-to-disable-bypass-proxy
|
||
|
session = requests.Session()
|
||
|
session.trust_env = False
|
||
|
parsed_url = urllib.parse.urlparse(url)
|
||
|
log.info(f"Downloading {parsed_url.scheme + parsed_url.netloc + parsed_url.path} to {destination}")
|
||
|
response = session.get(url)
|
||
|
Path(destination).parent.mkdir(parents=True, exist_ok=True)
|
||
|
with open(destination, "wb") as f:
|
||
|
f.write(response.content)
|
||
|
else:
|
||
|
log.debug(f"File {destination} was already downloaded")
|
||
|
return destination
|
||
|
|
||
|
# def rich_download_file(url, destination):
|
||
|
# if not Path(destination).is_file():
|
||
|
# progress = Progress(auto_refresh = True)
|
||
|
# # Disabling proxy speeds up requests time
|
||
|
# session = requests.Session()
|
||
|
# session.trust_env = False
|
||
|
# Path(destination).parent.mkdir(parents=True, exist_ok=True)
|
||
|
# with open(destination, 'wb') as f:
|
||
|
# response = session.get(url, stream=True)
|
||
|
# total = response.headers.get('content-length')
|
||
|
# task_id = progress.add_task(url)
|
||
|
# if total is None:
|
||
|
# f.write(response.content)
|
||
|
# else:
|
||
|
# progress.update(task_id, total=int(total))
|
||
|
# for data in response.iter_content(chunk_size=4096):
|
||
|
# f.write(data)
|
||
|
# progress.update(task_id, advance=len(data))
|
||
|
# progress.update(task_id, completed =int(total))
|
||
|
# else:
|
||
|
# log.debug(f"File {destination} was already downloaded")
|
||
|
# return destination
|
||
|
|
||
|
class notion_page_loaded(object):
|
||
|
"""An expectation for checking that a notion page has loaded.
|
||
|
"""
|
||
|
def __call__(self, driver):
|
||
|
notion_presence = len(driver.find_elements_by_class_name("notion-presence-container"))
|
||
|
loading_spinners = len(driver.find_elements_by_class_name("loading-spinner"));
|
||
|
# embed_ghosts = len(driver.find_elements_by_css_selector("div[embed-ghost]"));
|
||
|
log.debug(f"Waiting for page content to load (presence container: {notion_presence}, loaders: {loading_spinners} )")
|
||
|
if (notion_presence and not loading_spinners):
|
||
|
return True
|
||
|
else:
|
||
|
return False
|
||
|
|
||
|
|
||
|
class toggle_block_has_opened(object):
|
||
|
"""An expectation for checking that a notion toggle block has been opened.
|
||
|
It does so by checking if the div hosting the content has enough children,
|
||
|
and the abscence of the loading spinner.
|
||
|
"""
|
||
|
def __init__(self, toggle_block):
|
||
|
self.toggle_block = toggle_block
|
||
|
|
||
|
def __call__(self, driver):
|
||
|
toggle_content = self.toggle_block.find_element_by_css_selector("div:not([style]")
|
||
|
if (toggle_content):
|
||
|
content_children = len(toggle_content.find_elements_by_tag_name("div"))
|
||
|
is_loading = len(self.toggle_block.find_elements_by_class_name("loading-spinner"));
|
||
|
log.debug(f"Waiting for toggle block to load ({content_children} children so far and {is_loading} loaders)")
|
||
|
if (content_children > 3 and not is_loading):
|
||
|
return True
|
||
|
else:
|
||
|
return False
|
||
|
else:
|
||
|
return False
|
||
|
|
||
|
class Parser():
|
||
|
def __init__(self, dist_folder):
|
||
|
self.dist_folder = Path(dist_folder)
|
||
|
self.driver = self.init_chromedriver()
|
||
|
|
||
|
# create output path if it doesn't exists
|
||
|
self.dist_folder.mkdir(parents=True, exist_ok=True)
|
||
|
log.info(f"Setting output path to {self.dist_folder}")
|
||
|
|
||
|
def init_chromedriver(self):
|
||
|
log.info("Initialising chrome driver")
|
||
|
chrome_options = Options()
|
||
|
chrome_options.add_argument("--headless")
|
||
|
chrome_options.add_argument("window-size=1920,1080")
|
||
|
chrome_options.add_argument("--log-level=3");
|
||
|
chrome_options.add_argument("--silent");
|
||
|
chrome_options.add_argument("--disable-logging")
|
||
|
# removes the 'DevTools listening' log message
|
||
|
chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
|
||
|
return webdriver.Chrome(
|
||
|
executable_path=str(Path.cwd() / "bin" / "chromedriver.exe"),
|
||
|
service_log_path=str(Path.cwd() / "webdrive.log"),
|
||
|
options=chrome_options)
|
||
|
|
||
|
def parse_page(self, url, processed_pages, index = None):
|
||
|
# if this is the first page being parse, set it as the index.html
|
||
|
if (not index):
|
||
|
index = url;
|
||
|
|
||
|
log.info(f'Parsing page {url}')
|
||
|
self.driver.get(url)
|
||
|
try:
|
||
|
# WebDriverWait(self.driver, 10).until(notion_page_loaded())
|
||
|
WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'notion-presence-container')))
|
||
|
except TimeoutException as ex:
|
||
|
log.error("Timeout waiting for page content to load")
|
||
|
return
|
||
|
|
||
|
time.sleep(2)
|
||
|
|
||
|
# expands all the toggle block in the page to make their content visible
|
||
|
# we hook up our custom toggle logic afterwards
|
||
|
def open_toggle_blocks(exclude = []):
|
||
|
opened_toggles = exclude;
|
||
|
toggle_blocks = self.driver.find_elements_by_class_name("notion-toggle-block")
|
||
|
log.debug(f"Opening {len(toggle_blocks)} new toggle blocks in the page")
|
||
|
for toggle_block in toggle_blocks:
|
||
|
if (not toggle_block in opened_toggles):
|
||
|
toggle_button = toggle_block.find_element_by_css_selector("div[role=button]")
|
||
|
# check if the toggle is already open by the direction of its arrow
|
||
|
is_toggled = "(180deg)" in (toggle_button.find_element_by_tag_name("svg").get_attribute("style"))
|
||
|
if (not is_toggled):
|
||
|
# click on it, then wait until all elements are displayed
|
||
|
toggle_button.click()
|
||
|
try:
|
||
|
WebDriverWait(self.driver, 10).until(toggle_block_has_opened(toggle_block))
|
||
|
except TimeoutException as ex:
|
||
|
log.warn("Timeout waiting for toggle block to open")
|
||
|
opened_toggles.append(toggle_block)
|
||
|
# after all toggles have been opened, check the page again to see if
|
||
|
# any toggle block had nested toggle blocks inside them
|
||
|
new_toggle_blocks = self.driver.find_elements_by_class_name("notion-toggle-block")
|
||
|
if (len(new_toggle_blocks) > len(toggle_blocks)):
|
||
|
# if so, run the function again
|
||
|
open_toggle_blocks(opened_toggles)
|
||
|
|
||
|
open_toggle_blocks()
|
||
|
|
||
|
# creates soup from the page to start parsing
|
||
|
soup = BeautifulSoup(self.driver.page_source, "lxml")
|
||
|
|
||
|
# process eventual embedded iframes
|
||
|
for embed in soup.select('div[embed-ghost]'):
|
||
|
iframe = embed.find('iframe');
|
||
|
iframe_parent = iframe.parent
|
||
|
iframe_parent['class'] = iframe_parent.get('class', []) + ['loconotion-iframe-target']
|
||
|
iframe_parent['loconotion-iframe-src'] = iframe['src']
|
||
|
|
||
|
# process meta tags
|
||
|
def set_meta_tag(prop_name, prop_value, content):
|
||
|
tag = soup.find("meta", attrs = { prop_name : prop_value})
|
||
|
if (tag):
|
||
|
log.debug(f"Setting meta tag {prop_value} to {content}")
|
||
|
if (content): tag["content"] = content
|
||
|
else: tag.decompose();
|
||
|
else:
|
||
|
log.warn(f"Meta tag with {prop_name}: {prop_value} was not found")
|
||
|
|
||
|
set_meta_tag("name", "description", None)
|
||
|
set_meta_tag("name", "twitter:card", None)
|
||
|
set_meta_tag("name", "twitter:site", None)
|
||
|
set_meta_tag("name", "twitter:title", None)
|
||
|
set_meta_tag("name", "twitter:description", None)
|
||
|
set_meta_tag("name", "twitter:image", None)
|
||
|
set_meta_tag("name", "twitter:url", None)
|
||
|
set_meta_tag("property", "og:site_name", None)
|
||
|
set_meta_tag("property", "og:type", None)
|
||
|
set_meta_tag("property", "og:url", None)
|
||
|
set_meta_tag("property", "og:title", None)
|
||
|
set_meta_tag("property", "og:description", None)
|
||
|
set_meta_tag("property", "og:image", None)
|
||
|
set_meta_tag("name", "apple-itunes-app", None)
|
||
|
|
||
|
# process images
|
||
|
cache_images = True
|
||
|
for img in soup.findAll('img'):
|
||
|
if img.has_attr('src'):
|
||
|
if (cache_images):
|
||
|
img_src = img['src']
|
||
|
|
||
|
# if the path starts with /, it's one of notion's predefined images
|
||
|
if (img['src'].startswith('/')):
|
||
|
# notion's images urls are in a weird format, need to sanitize them
|
||
|
img_src = 'https://www.notion.so' + img['src'].split("notion.so")[-1].replace("notion.so", "").split("?")[0]
|
||
|
|
||
|
# generate an hashed id for the image filename based the url,
|
||
|
# so we avoid re-downloading images we have already downloaded,
|
||
|
# and figure out the filename from the url (I know, just this once)
|
||
|
img_extension = Path(urllib.parse.urlparse(img_src).path).suffix
|
||
|
img_name = hashlib.sha1(str.encode(img_src)).hexdigest();
|
||
|
img_file = img_name + img_extension
|
||
|
|
||
|
download_file(img_src, self.dist_folder / img_file)
|
||
|
img['src'] = img_file
|
||
|
else:
|
||
|
if (img['src'].startswith('/')):
|
||
|
img['src'] = "https://www.notion.so" + img['src']
|
||
|
|
||
|
# process stylesheets
|
||
|
for link in soup.findAll('link', rel="stylesheet"):
|
||
|
if link.has_attr('href') and link['href'].startswith('/'):
|
||
|
# we don't need the vendors stylesheet
|
||
|
if ("vendors~" in link['href']):
|
||
|
continue
|
||
|
css_file = link['href'].replace('/', '')
|
||
|
saved_css_file = download_file('https://www.notion.so' + link['href'], self.dist_folder / css_file)
|
||
|
with open(saved_css_file, 'rb') as f:
|
||
|
stylesheet = cssutils.parseString(f.read())
|
||
|
# open the stylesheet and check for any font-face rule,
|
||
|
for rule in stylesheet.cssRules:
|
||
|
if rule.type == cssutils.css.CSSRule.FONT_FACE_RULE:
|
||
|
# if any are found, download the font file
|
||
|
font_file = rule.style['src'].split("url(/")[-1].split(") format")[0]
|
||
|
download_file(f'https://www.notion.so/{font_file}', self.dist_folder / font_file)
|
||
|
link['href'] = css_file
|
||
|
|
||
|
# remove scripts and other tags we don't want / need
|
||
|
for unwanted in soup.findAll(['script', 'iframe']):
|
||
|
unwanted.decompose();
|
||
|
for intercom_div in soup.findAll('div',{'class':'intercom-lightweight-app'}):
|
||
|
intercom_div.decompose();
|
||
|
for overlay_div in soup.findAll('div',{'class':'notion-overlay-container'}):
|
||
|
overlay_div.decompose();
|
||
|
|
||
|
# add our custom logic to all toggle blocks
|
||
|
for toggle_block in soup.findAll('div',{'class':'notion-toggle-block'}):
|
||
|
toggle_id = uuid.uuid4()
|
||
|
toggle_button = toggle_block.select_one('div[role=button]')
|
||
|
toggle_content = toggle_block.find('div', {'class': None, 'style': ''})
|
||
|
if (toggle_button and toggle_content):
|
||
|
# add a custom class to the toggle button and content, plus a custom attribute
|
||
|
# sharing a unique uiid so we can hook them up with some custom js logic later
|
||
|
toggle_button['class'] = toggle_block.get('class', []) + ['loconotion-toggle-button']
|
||
|
toggle_content['class'] = toggle_content.get('class', []) + ['loconotion-toggle-content']
|
||
|
toggle_content.attrs['loconotion-toggle-id'] = toggle_button.attrs['loconotion-toggle-id'] = toggle_id
|
||
|
|
||
|
# embed custom google font
|
||
|
custom_font = None
|
||
|
if (custom_font):
|
||
|
custom_font_stylesheet_stylesheet = soup.new_tag("link")
|
||
|
custom_font_stylesheet.attrs["rel"] = "stylesheet"
|
||
|
custom_font_stylesheet.attrs["href"] = f"https://fonts.googleapis.com/css2?family={custom_font}:wght@500;600;700&display=swap"
|
||
|
soup.head.insert(-1, custom_font_stylesheet)
|
||
|
for app in soup.findAll('div',{'class':'notion-app-inner'}):
|
||
|
style = cssutils.parseStyle(app['style']);
|
||
|
style['font-family'] = f"'{custom_font}', {style['font-family']}"
|
||
|
app['style'] = style.cssText
|
||
|
|
||
|
# append custom stylesheet
|
||
|
custom_css = soup.new_tag("link")
|
||
|
custom_css.attrs["rel"] = "stylesheet"
|
||
|
custom_css.attrs["href"] = "loconotion.css"
|
||
|
soup.head.insert(-1, custom_css)
|
||
|
|
||
|
# append custom script
|
||
|
custom_script = soup.new_tag("script")
|
||
|
custom_script.attrs["type"] = "text/javascript"
|
||
|
custom_script.attrs["src"] = "loconotion.js"
|
||
|
soup.body.insert(-1, custom_script)
|
||
|
|
||
|
# find sub-pages and clean slugs / links
|
||
|
sub_pages = [];
|
||
|
for a in soup.findAll('a'):
|
||
|
if a['href'].startswith('/'):
|
||
|
sub_page_href = 'https://www.notion.so' + a['href']
|
||
|
sub_pages.append(sub_page_href)
|
||
|
a['href'] = get_clean_slug(sub_page_href) if sub_page_href != index else "index.html"
|
||
|
log.debug(f"Found link to page {a['href']}")
|
||
|
|
||
|
# exports the parsed page
|
||
|
html_str = str(soup)
|
||
|
html_file = get_clean_slug(url) if url != index else "index.html"
|
||
|
log.info(f"Exporting page {url} as {html_file}")
|
||
|
with open(self.dist_folder / html_file, "wb") as f:
|
||
|
f.write(html_str.encode('utf-8').strip())
|
||
|
processed_pages.append(url)
|
||
|
|
||
|
# parse sub-pages
|
||
|
for sub_page in sub_pages:
|
||
|
if not sub_page in processed_pages:
|
||
|
self.parse_page(sub_page, processed_pages, index)
|
||
|
|
||
|
def run(self, url):
|
||
|
processed_pages = []
|
||
|
self.parse_page(url, processed_pages)
|
||
|
|
||
|
# copy custom assets to dist folder
|
||
|
shutil.copyfile("loconotion.css", self.dist_folder / "loconotion.css");
|
||
|
shutil.copyfile("loconotion.js", self.dist_folder / "loconotion.js");
|
||
|
|
||
|
if __name__ == '__main__':
|
||
|
try:
|
||
|
url = "https://www.notion.so/leoncvlt-f276385bf5ce42969497f0b03aef907e"
|
||
|
output_folder = Path("dist") / get_clean_slug(url, extension = False)
|
||
|
parser = Parser(output_folder)
|
||
|
parser.run(url)
|
||
|
# parser.run("https://www.notion.so/A-Notion-Page-03c403f4fdc94cc1b315b9469a8950ef")
|
||
|
# parser.run("https://www.notion.so/Media-be1a5c3e1c9640a0ab9ba0ba9b67e6a5")
|
||
|
# parser.run('https://www.notion.so/leoncvlt-f276385bf5ce42969497f0b03aef907e')
|
||
|
except KeyboardInterrupt:
|
||
|
log.error('Interrupted by user')
|
||
|
try:
|
||
|
sys.exit(0)
|
||
|
except SystemExit:
|
||
|
os._exit(0)
|