mirror of
https://github.com/leoncvlt/loconotion.git
synced 2024-08-30 18:12:12 +00:00
Initial project commit
Working version with no config exposed
This commit is contained in:
parent
7fd64af08a
commit
b0a0baf9fb
117
.gitignore
vendored
Normal file
117
.gitignore
vendored
Normal file
@ -0,0 +1,117 @@
|
||||
|
||||
# Created by https://www.gitignore.io/api/python
|
||||
# Edit at https://www.gitignore.io/?templates=python
|
||||
|
||||
### Python ###
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
pip-wheel-metadata/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
target/
|
||||
|
||||
# pyenv
|
||||
.python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# celery beat schedule file
|
||||
celerybeat-schedule
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# Mr Developer
|
||||
.mr.developer.cfg
|
||||
.project
|
||||
.pydevproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# End of https://www.gitignore.io/api/python
|
||||
|
||||
.vscode
|
||||
env
|
||||
dist/*
|
||||
debug.log
|
||||
webdrive.log
|
||||
*.bat
|
BIN
bin/chromedriver.exe
Normal file
BIN
bin/chromedriver.exe
Normal file
Binary file not shown.
62
loconotion.css
Normal file
62
loconotion.css
Normal file
@ -0,0 +1,62 @@
|
||||
/* Buttons: restore the hover highlight (page icons are left alone). */
div[role="button"]:not(.notion-record-icon):hover {
  background: rgba(55, 53, 47, 0.08);
}

/* Loading spinners are never shown in the exported page. */
.loading-spinner {
  display: none !important;
}

/* Top bar: keep only the first group, hiding the controls on the right
   (search box, duplicate, notion shortcut). */
.notion-topbar > div > div:not(:first-child) {
  display: none !important;
}

/* Everything below only applies to viewports up to 960px wide. */
@media only screen and (max-width: 960px) {
  /* Banner: full width, capped at 900px, no side padding. */
  .notion-scroller > div > div:not([class]) {
    width: 100% !important;
    max-width: 900px !important;
    padding-left: 0 !important;
    padding-right: 0 !important;
  }

  /* Page content: full width, no cap, no side padding. */
  .notion-page-content {
    width: 100% !important;
    max-width: unset !important;
    padding-right: 0 !important;
    padding-left: 0 !important;
  }

  /* Database views: drop their side padding. */
  .notion-list-view,
  .notion-gallery-view,
  .notion-table-view,
  .notion-board-view,
  .notion-calendar-view {
    padding-left: 0 !important;
    padding-right: 0 !important;
  }

  /* Banner text gets side padding; the banner image does not. */
  .notion-scroller > div:first-child > div:last-child {
    padding-right: 2em !important;
    padding-left: 2em !important;
  }

  /* Side padding for the content area. */
  .notion-scroller > div:nth-child(2) {
    padding-right: 2em !important;
    padding-left: 2em !important;
  }

  /* Column lists: stack the columns vertically instead of side by side. */
  .notion-column_list-block > div {
    flex-direction: column;
  }
  .notion-column_list-block > div > * {
    width: unset !important;
  }
}
|
61
loconotion.js
Normal file
61
loconotion.js
Normal file
@ -0,0 +1,61 @@
|
||||
/**
 * Expands a toggle: rotates its arrow to 180 degrees and displays the content
 * as a block.
 *
 * @param {HTMLElement} content - element holding the toggle's content.
 * @param {Element} arrow - arrow element of the toggle button.
 */
const showToggle = (content, arrow) => {
  const arrowStyle = arrow.style;
  arrowStyle.transform = "rotateZ(180deg)";
  const contentStyle = content.style;
  contentStyle.display = "block";
};
|
||||
|
||||
/**
 * Collapses a toggle: rotates its arrow to 90 degrees and hides the content.
 *
 * @param {HTMLElement} content - element holding the toggle's content.
 * @param {Element} arrow - arrow element of the toggle button.
 */
const hideToggle = (content, arrow) => {
  const arrowStyle = arrow.style;
  arrowStyle.transform = "rotateZ(90deg)";
  const contentStyle = content.style;
  contentStyle.display = "none";
};
|
||||
|
||||
// Wires every exported toggle button to the content element that carries the
// same `loconotion-toggle-id` attribute. Each toggle starts collapsed and
// flips between collapsed and expanded on click.
const toggleButtons = document.getElementsByClassName("loconotion-toggle-button");
for (const toggleButton of toggleButtons) {
  const toggleId = toggleButton.getAttribute("loconotion-toggle-id");
  const toggleContent = document.querySelector(
    `.loconotion-toggle-content[loconotion-toggle-id='${toggleId}']`
  );
  const toggleArrow = toggleButton.querySelector("svg");

  // Both the content and the arrow are handed to showToggle / hideToggle,
  // which dereference them unconditionally. Skip incomplete toggles instead
  // of throwing, so one malformed block cannot abort the rest of this script.
  if (!toggleContent || !toggleArrow) {
    continue;
  }

  hideToggle(toggleContent, toggleArrow);
  toggleButton.addEventListener("click", () => {
    if (toggleContent.style.display === "none") {
      showToggle(toggleContent, toggleArrow);
    } else {
      hideToggle(toggleContent, toggleArrow);
    }
  });
}
|
||||
|
||||
// Builds the real <iframe> for every placeholder marked with the
// "loconotion-iframe-target" class, using the URL stored in its
// `loconotion-iframe-src` attribute. The placeholder stays transparent until
// the frame reports that it has loaded.
const pendingIframes = document.getElementsByClassName("loconotion-iframe-target");
for (let index = 0; index < pendingIframes.length; index += 1) {
  const host = pendingIframes.item(index);
  const frame = document.createElement("iframe");
  const frameSrc = host.getAttribute("loconotion-iframe-src");

  host.style.opacity = 0;
  frame.onload = () => {
    host.style.opacity = 1;
  };

  // The frame fills its placeholder and receives pointer events itself.
  Object.assign(frame.style, {
    width: "100%",
    height: "100%",
    position: "absolute",
    left: 0,
    top: 0,
    pointerEvents: "auto",
  });

  const frameAttributes = [
    ["src", frameSrc],
    ["frameborder", "0"],
    [
      "sandbox",
      "allow-scripts allow-popups allow-top-navigation-by-user-activation allow-forms allow-same-origin",
    ],
  ];
  for (const [attribute, value] of frameAttributes) {
    frame.setAttribute(attribute, value);
  }

  host.appendChild(frame);
}
|
||||
|
||||
// Hides the wrapper element around every collection search box.
const collectionSearchBoxes = document.getElementsByClassName("collectionSearch");
for (let i = 0; i < collectionSearchBoxes.length; i++) {
  // `parentElement` is a property, not a method: calling it threw a TypeError.
  const collectionSearchBox = collectionSearchBoxes.item(i).parentElement;
  // parentElement is null when the node has no element parent.
  if (collectionSearchBox) {
    collectionSearchBox.style.display = "none";
  }
}
|
356
loconotion.py
Normal file
356
loconotion.py
Normal file
@ -0,0 +1,356 @@
|
||||
import os
|
||||
import sys
|
||||
import requests
|
||||
import shutil
|
||||
import time
|
||||
import uuid
|
||||
import logging
|
||||
import re
|
||||
from rich.logging import RichHandler
|
||||
from rich.progress import Progress
|
||||
import urllib.parse
|
||||
import hashlib
|
||||
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.common.exceptions import TimeoutException, NoSuchElementException
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from pathlib import Path
|
||||
import cssutils
|
||||
cssutils.log.setLevel(logging.CRITICAL) # removes warning logs from cssutils
|
||||
|
||||
def setup_logger(name):
    """Return the logger called `name`, set to DEBUG and printing through rich.

    Args:
        name: name of the logger, as passed to logging.getLogger().

    Returns:
        The configured logging.Logger instance.
    """
    logger = logging.getLogger(name)
    # logging.getLogger() hands back the same object for the same name, so only
    # attach a RichHandler when there is none yet; otherwise a second call
    # would make every record print twice.
    if not any(isinstance(handler, RichHandler) for handler in logger.handlers):
        logger.addHandler(RichHandler())
    logger.setLevel(logging.DEBUG)
    return logger
|
||||
|
||||
log = setup_logger("loconotion-logger")
|
||||
|
||||
def get_clean_slug(url, extension = True):
    """Derive a file-name slug from the path of a page url.

    All slashes are removed from the url path. When the result contains a
    dash, everything from the last dash on is dropped and the rest is
    lowercased: a standard notion page looks like the-page-title-[uuid], so
    this keeps the page title only. A path without dashes (e.g. a database
    page, which is just a uuid) is returned unchanged.

    Args:
        url: url of the page.
        extension: when True (default) ".html" is appended to the slug.

    Returns:
        The slug as a string.
    """
    # urlparse() already splits the query string off, so the path can never
    # contain a "?" and needs no further clean-up for database pages
    path = urllib.parse.urlparse(url).path.replace('/', '')
    if "-" in path:
        path = path.rpartition("-")[0].lower()
    return path + (".html" if extension else "")
|
||||
|
||||
def download_file(url, destination):
    """Download `url` to `destination`, unless that file already exists.

    Args:
        url: address of the file to fetch.
        destination: path (str or Path) the response body is written to;
            missing parent folders are created.

    Returns:
        `destination`, whether the file was downloaded now or found on disk.
    """
    target = Path(destination)
    if target.is_file():
        log.debug(f"File {destination} was already downloaded")
        return destination

    parsed_url = urllib.parse.urlparse(url)
    # the query string is left out of the log message
    log.info(f"Downloading {parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path} to {destination}")

    # Disabling proxy speeds up requests time
    # https://stackoverflow.com/questions/45783655/first-https-request-takes-much-more-time-than-the-rest
    # https://stackoverflow.com/questions/28521535/requests-how-to-disable-bypass-proxy
    with requests.Session() as session:
        session.trust_env = False
        response = session.get(url)

    if not response.ok:
        # the body is still saved, as before, so callers that open the file
        # right away keep working; the warning makes the bad asset traceable
        log.warning(f"Got status code {response.status_code} while downloading {destination}")

    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, "wb") as f:
        f.write(response.content)
    return destination
|
||||
|
||||
# def rich_download_file(url, destination):
|
||||
# if not Path(destination).is_file():
|
||||
# progress = Progress(auto_refresh = True)
|
||||
# # Disabling proxy speeds up requests time
|
||||
# session = requests.Session()
|
||||
# session.trust_env = False
|
||||
# Path(destination).parent.mkdir(parents=True, exist_ok=True)
|
||||
# with open(destination, 'wb') as f:
|
||||
# response = session.get(url, stream=True)
|
||||
# total = response.headers.get('content-length')
|
||||
# task_id = progress.add_task(url)
|
||||
# if total is None:
|
||||
# f.write(response.content)
|
||||
# else:
|
||||
# progress.update(task_id, total=int(total))
|
||||
# for data in response.iter_content(chunk_size=4096):
|
||||
# f.write(data)
|
||||
# progress.update(task_id, advance=len(data))
|
||||
# progress.update(task_id, completed =int(total))
|
||||
# else:
|
||||
# log.debug(f"File {destination} was already downloaded")
|
||||
# return destination
|
||||
|
||||
class notion_page_loaded(object):
    """Wait condition that holds once a notion page has finished loading.

    The page counts as loaded when at least one "notion-presence-container"
    element exists and no "loading-spinner" element is left.
    """

    def __call__(self, driver):
        presence_count = len(driver.find_elements_by_class_name("notion-presence-container"))
        spinner_count = len(driver.find_elements_by_class_name("loading-spinner"))
        log.debug(f"Waiting for page content to load (presence container: {presence_count}, loaders: {spinner_count} )")
        return presence_count > 0 and spinner_count == 0
|
||||
|
||||
|
||||
class toggle_block_has_opened(object):
    """An expectation for checking that a notion toggle block has been opened.

    It does so by checking if the div hosting the content has enough children
    (more than 3 nested divs), and the absence of the loading spinner.
    """

    def __init__(self, toggle_block):
        # element of the toggle block being watched
        self.toggle_block = toggle_block

    def __call__(self, driver):
        # the content host is the first div without an inline style; the
        # selector used to be missing its closing parenthesis
        toggle_content = self.toggle_block.find_element_by_css_selector("div:not([style])")
        if not toggle_content:
            return False
        content_children = len(toggle_content.find_elements_by_tag_name("div"))
        is_loading = len(self.toggle_block.find_elements_by_class_name("loading-spinner"))
        log.debug(f"Waiting for toggle block to load ({content_children} children so far and {is_loading} loaders)")
        return content_children > 3 and not is_loading
|
||||
|
||||
class Parser():
    """Exports a notion page, and the notion pages it links to, as static HTML.

    Pages are rendered in a headless Chrome driven through Selenium, rewritten
    with BeautifulSoup and written to `dist_folder`, together with the images,
    stylesheets and fonts they reference.
    """

    def __init__(self, dist_folder):
        """Start the chrome driver and make sure the output folder exists.

        Args:
            dist_folder: path (str or Path) of the folder the site is exported to.
        """
        self.dist_folder = Path(dist_folder)
        self.driver = self.init_chromedriver()

        # create output path if it doesn't exists
        self.dist_folder.mkdir(parents=True, exist_ok=True)
        log.info(f"Setting output path to {self.dist_folder}")

    def init_chromedriver(self):
        """Create the headless Chrome webdriver used to render the pages.

        The driver executable is expected at bin/chromedriver.exe and the
        service log is written to webdrive.log, both relative to the current
        working directory.
        """
        log.info("Initialising chrome driver")
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("window-size=1920,1080")
        chrome_options.add_argument("--log-level=3")
        chrome_options.add_argument("--silent")
        chrome_options.add_argument("--disable-logging")
        # removes the 'DevTools listening' log message
        chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
        return webdriver.Chrome(
            executable_path=str(Path.cwd() / "bin" / "chromedriver.exe"),
            service_log_path=str(Path.cwd() / "webdrive.log"),
            options=chrome_options)

    def _open_toggle_blocks(self, opened_toggles = None):
        """Expand every collapsed toggle block of the page loaded in the driver.

        Args:
            opened_toggles: toggle elements already handled by a previous pass;
                a fresh list is used when omitted (no mutable default).
        """
        if opened_toggles is None:
            opened_toggles = []
        toggle_blocks = self.driver.find_elements_by_class_name("notion-toggle-block")
        log.debug(f"Opening {len(toggle_blocks)} new toggle blocks in the page")
        for toggle_block in toggle_blocks:
            if toggle_block in opened_toggles:
                continue
            toggle_button = toggle_block.find_element_by_css_selector("div[role=button]")
            # check if the toggle is already open by the direction of its arrow
            is_toggled = "(180deg)" in (toggle_button.find_element_by_tag_name("svg").get_attribute("style"))
            if not is_toggled:
                # click on it, then wait until all elements are displayed
                toggle_button.click()
                try:
                    WebDriverWait(self.driver, 10).until(toggle_block_has_opened(toggle_block))
                except TimeoutException:
                    log.warning("Timeout waiting for toggle block to open")
            opened_toggles.append(toggle_block)
        # after all toggles have been opened, check the page again to see if
        # any toggle block had nested toggle blocks inside them
        new_toggle_blocks = self.driver.find_elements_by_class_name("notion-toggle-block")
        if len(new_toggle_blocks) > len(toggle_blocks):
            # if so, run the function again
            self._open_toggle_blocks(opened_toggles)

    def parse_page(self, url, processed_pages, index = None):
        """Export the page at `url`, then recursively every page it links to.

        Args:
            url: address of the notion page to export.
            processed_pages: list of urls exported so far; `url` is appended
                to it once the page has been written.
            index: url of the page exported as index.html; defaults to `url`,
                i.e. the first page being parsed.

        Returns:
            None. A page that does not load within 10 seconds is logged and
            skipped.
        """
        # if this is the first page being parse, set it as the index.html
        if not index:
            index = url

        log.info(f'Parsing page {url}')
        self.driver.get(url)
        try:
            WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'notion-presence-container')))
        except TimeoutException:
            log.error("Timeout waiting for page content to load")
            return

        time.sleep(2)

        # expands all the toggle block in the page to make their content visible
        # we hook up our custom toggle logic afterwards
        self._open_toggle_blocks()

        # creates soup from the page to start parsing
        soup = BeautifulSoup(self.driver.page_source, "lxml")

        # process eventual embedded iframes: remember their source on the
        # parent so loconotion.js can re-create them in the exported page
        for embed in soup.select('div[embed-ghost]'):
            iframe = embed.find('iframe')
            if iframe is None or not iframe.has_attr('src'):
                # nothing to restore later; used to raise on the missing iframe
                continue
            iframe_parent = iframe.parent
            iframe_parent['class'] = iframe_parent.get('class', []) + ['loconotion-iframe-target']
            iframe_parent['loconotion-iframe-src'] = iframe['src']

        # process meta tags
        def set_meta_tag(prop_name, prop_value, content):
            # sets the content of the matching meta tag, or removes the tag
            # when no content is given
            tag = soup.find("meta", attrs = { prop_name : prop_value})
            if tag:
                log.debug(f"Setting meta tag {prop_value} to {content}")
                if content:
                    tag["content"] = content
                else:
                    tag.decompose()
            else:
                log.warning(f"Meta tag with {prop_name}: {prop_value} was not found")

        meta_tags = [
            ("name", "description"),
            ("name", "twitter:card"),
            ("name", "twitter:site"),
            ("name", "twitter:title"),
            ("name", "twitter:description"),
            ("name", "twitter:image"),
            ("name", "twitter:url"),
            ("property", "og:site_name"),
            ("property", "og:type"),
            ("property", "og:url"),
            ("property", "og:title"),
            ("property", "og:description"),
            ("property", "og:image"),
            ("name", "apple-itunes-app"),
        ]
        for prop_name, prop_value in meta_tags:
            set_meta_tag(prop_name, prop_value, None)

        # process images
        cache_images = True
        for img in soup.findAll('img'):
            if not img.has_attr('src'):
                continue
            if img['src'].startswith('data:'):
                # inline images carry their own data: nothing to download
                continue
            if cache_images:
                img_src = img['src']

                # if the path starts with /, it's one of notion's predefined images
                if img['src'].startswith('/'):
                    # notion's images urls are in a weird format, need to sanitize them
                    img_src = 'https://www.notion.so' + img['src'].split("notion.so")[-1].replace("notion.so", "").split("?")[0]

                # generate an hashed id for the image filename based the url,
                # so we avoid re-downloading images we have already downloaded,
                # and figure out the filename from the url (I know, just this once)
                img_extension = Path(urllib.parse.urlparse(img_src).path).suffix
                img_name = hashlib.sha1(str.encode(img_src)).hexdigest()
                img_file = img_name + img_extension

                download_file(img_src, self.dist_folder / img_file)
                img['src'] = img_file
            elif img['src'].startswith('/'):
                img['src'] = "https://www.notion.so" + img['src']

        # process stylesheets
        for link in soup.findAll('link', rel="stylesheet"):
            if link.has_attr('href') and link['href'].startswith('/'):
                # we don't need the vendors stylesheet
                if "vendors~" in link['href']:
                    continue
                css_file = link['href'].replace('/', '')
                saved_css_file = download_file('https://www.notion.so' + link['href'], self.dist_folder / css_file)
                with open(saved_css_file, 'rb') as f:
                    stylesheet = cssutils.parseString(f.read())
                # check the stylesheet for any font-face rule,
                for rule in stylesheet.cssRules:
                    if rule.type == cssutils.css.CSSRule.FONT_FACE_RULE:
                        # if any are found, download the font file
                        font_file = rule.style['src'].split("url(/")[-1].split(") format")[0]
                        download_file(f'https://www.notion.so/{font_file}', self.dist_folder / font_file)
                link['href'] = css_file

        # remove scripts and other tags we don't want / need
        for unwanted in soup.findAll(['script', 'iframe']):
            unwanted.decompose()
        for intercom_div in soup.findAll('div',{'class':'intercom-lightweight-app'}):
            intercom_div.decompose()
        for overlay_div in soup.findAll('div',{'class':'notion-overlay-container'}):
            overlay_div.decompose()

        # add our custom logic to all toggle blocks
        for toggle_block in soup.findAll('div',{'class':'notion-toggle-block'}):
            toggle_id = str(uuid.uuid4())
            toggle_button = toggle_block.select_one('div[role=button]')
            toggle_content = toggle_block.find('div', {'class': None, 'style': ''})
            if toggle_button and toggle_content:
                # add a custom class to the toggle button and content, plus a custom attribute
                # sharing a unique uuid so we can hook them up with some custom js logic later.
                # The button keeps its own classes (it used to be given the block's classes).
                toggle_button['class'] = toggle_button.get('class', []) + ['loconotion-toggle-button']
                toggle_content['class'] = toggle_content.get('class', []) + ['loconotion-toggle-content']
                toggle_content.attrs['loconotion-toggle-id'] = toggle_button.attrs['loconotion-toggle-id'] = toggle_id

        # embed custom google font (not exposed yet, hence always disabled)
        custom_font = None
        if custom_font:
            custom_font_stylesheet = soup.new_tag("link")
            custom_font_stylesheet.attrs["rel"] = "stylesheet"
            custom_font_stylesheet.attrs["href"] = f"https://fonts.googleapis.com/css2?family={custom_font}:wght@500;600;700&display=swap"
            soup.head.append(custom_font_stylesheet)
            for app in soup.findAll('div',{'class':'notion-app-inner'}):
                style = cssutils.parseStyle(app['style'])
                style['font-family'] = f"'{custom_font}', {style['font-family']}"
                app['style'] = style.cssText

        # append custom stylesheet as the last element of the head
        custom_css = soup.new_tag("link")
        custom_css.attrs["rel"] = "stylesheet"
        custom_css.attrs["href"] = "loconotion.css"
        soup.head.append(custom_css)

        # append custom script as the last element of the body, so the whole
        # document is already parsed when it runs
        custom_script = soup.new_tag("script")
        custom_script.attrs["type"] = "text/javascript"
        custom_script.attrs["src"] = "loconotion.js"
        soup.body.append(custom_script)

        # find sub-pages and clean slugs / links
        sub_pages = []
        for a in soup.findAll('a'):
            # anchors without an href are left untouched
            if a.get('href', '').startswith('/'):
                sub_page_href = 'https://www.notion.so' + a['href']
                sub_pages.append(sub_page_href)
                a['href'] = get_clean_slug(sub_page_href) if sub_page_href != index else "index.html"
                log.debug(f"Found link to page {a['href']}")

        # exports the parsed page
        html_str = str(soup)
        html_file = get_clean_slug(url) if url != index else "index.html"
        log.info(f"Exporting page {url} as {html_file}")
        with open(self.dist_folder / html_file, "wb") as f:
            f.write(html_str.encode('utf-8').strip())
        processed_pages.append(url)

        # parse sub-pages
        for sub_page in sub_pages:
            if sub_page not in processed_pages:
                self.parse_page(sub_page, processed_pages, index)

    def run(self, url):
        """Export the site starting at `url` and copy the custom assets next to it.

        loconotion.css and loconotion.js are read from the current working
        directory.
        """
        processed_pages = []
        self.parse_page(url, processed_pages)

        # copy custom assets to dist folder
        shutil.copyfile("loconotion.css", self.dist_folder / "loconotion.css")
        shutil.copyfile("loconotion.js", self.dist_folder / "loconotion.js")
||||
|
||||
if __name__ == '__main__':
    try:
        # the page to export can be given as the first command line argument;
        # without it the default page below is exported, as before
        url = sys.argv[1] if len(sys.argv) > 1 else "https://www.notion.so/leoncvlt-f276385bf5ce42969497f0b03aef907e"
        # the site is written to dist/<slug of the page>
        output_folder = Path("dist") / get_clean_slug(url, extension = False)
        parser = Parser(output_folder)
        parser.run(url)
        # other pages used while developing:
        # parser.run("https://www.notion.so/A-Notion-Page-03c403f4fdc94cc1b315b9469a8950ef")
        # parser.run("https://www.notion.so/Media-be1a5c3e1c9640a0ab9ba0ba9b67e6a5")
        # parser.run('https://www.notion.so/leoncvlt-f276385bf5ce42969497f0b03aef907e')
    except KeyboardInterrupt:
        log.error('Interrupted by user')
        try:
            sys.exit(0)
        except SystemExit:
            # end the process right away instead of letting SystemExit propagate
            os._exit(0)
|
Loading…
Reference in New Issue
Block a user