Mirror of https://github.com/leoncvlt/loconotion.git (synced 2024-08-30 18:12:12 +00:00)
Merge pull request #92 from flipio-ru/refactor
Refactor for readability, unit testing, and community contributions
This commit is contained in: commit 46c77076ab
CONTRIBUTING.md (new file, 5 lines)
@@ -0,0 +1,5 @@
Before submitting your PR for review:

1. Please add unit tests checking your code.
2. Please merge the master branch into your branch and resolve any merge conflicts.
3. Please check that all the current tests in the `/tests` directory pass successfully. We use the `pytest` framework.
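For reference, a test in this suite is a plain `pytest` function discovered under `/tests`; a minimal sketch follows (file name, function, and assertion are illustrative, not part of this commit):

# tests/test_example.py -- illustrative sketch only
def test_slug_is_lowercase():
    # replace with a real assertion about the code under test
    slug = "My-Page-Title".lower()
    assert slug == "my-page-title"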
@@ -1,148 +1,14 @@
-from notionparser import Parser
-import os
-import sys
-import logging
-import urllib.parse
-import argparse
-from pathlib import Path
-
-log = logging.getLogger("loconotion")
-
-try:
-    import requests
-    import toml
-except ModuleNotFoundError as error:
-    log.critical(
-        f"ModuleNotFoundError: {error}. have your installed the requirements?")
-    sys.exit()
-
-
-def main():
-    # set up argument parser
-    argparser = argparse.ArgumentParser(
-        description="Generate static websites from Notion.so pages"
-    )
-    argparser.add_argument(
-        "target",
-        help="The config file containing the site properties, or the url"
-        " of the Notion.so page to generate the site from",
-    )
-    argparser.add_argument(
-        "--chromedriver",
-        help="Use a specific chromedriver executable instead of the auto-installing one",
-    )
-    argparser.add_argument(
-        "--single-page", action="store_true", help="Only parse the first page, then stop"
-    )
-    argparser.add_argument(
-        "--dark-theme",
-        action="store_true",
-        help="Use dark themed version of the target Notion.so page",
-    )
-    argparser.add_argument(
-        "--timeout",
-        type=int,
-        default=5,
-        help="Time in seconds to wait for the loading of lazy-loaded dynamic elements (default 5)."
-        " If content from the page seems to be missing, try increasing this value",
-    )
-    argparser.add_argument(
-        "--clean",
-        action="store_true",
-        help="Delete all previously cached files for the site before generating it",
-    )
-    argparser.add_argument(
-        "--clean-css",
-        action="store_true",
-        help="Delete previously cached .css files for the site before generating it",
-    )
-    argparser.add_argument(
-        "--clean-js",
-        action="store_true",
-        help="Delete previously cached .js files for the site before generating it",
-    )
-    argparser.add_argument(
-        "--non-headless",
-        action="store_true",
-        help="Run chromedriver in non-headless mode",
-    )
-    argparser.add_argument(
-        "-v", "--verbose", action="store_true", help="Increase output log verbosity"
-    )
-    args = argparser.parse_args()
-
-    # set up some pretty logs
-    log = logging.getLogger("loconotion")
-    log.setLevel(logging.INFO if not args.verbose else logging.DEBUG)
-    log_screen_handler = logging.StreamHandler(stream=sys.stdout)
-    log.addHandler(log_screen_handler)
-    log.propagate = False
-    try:
-        import colorama
-        import copy
-
-        LOG_COLORS = {
-            logging.DEBUG: colorama.Fore.GREEN,
-            logging.INFO: colorama.Fore.BLUE,
-            logging.WARNING: colorama.Fore.YELLOW,
-            logging.ERROR: colorama.Fore.RED,
-            logging.CRITICAL: colorama.Back.RED,
-        }
-
-        class ColorFormatter(logging.Formatter):
-            def format(self, record, *args, **kwargs):
-                # if the corresponding logger has children, they may receive modified
-                # record, so we want to keep it intact
-                new_record = copy.copy(record)
-                if new_record.levelno in LOG_COLORS:
-                    new_record.levelname = "{color_begin}{level}{color_end}".format(
-                        level=new_record.levelname,
-                        color_begin=LOG_COLORS[new_record.levelno],
-                        color_end=colorama.Style.RESET_ALL,
-                    )
-                return super(ColorFormatter, self).format(new_record, *args, **kwargs)
-
-        log_screen_handler.setFormatter(
-            ColorFormatter(
-                fmt="%(asctime)s %(levelname)-8s %(message)s",
-                datefmt="{color_begin}[%H:%M:%S]{color_end}".format(
-                    color_begin=colorama.Style.DIM, color_end=colorama.Style.RESET_ALL
-                ),
-            )
-        )
-    except ModuleNotFoundError as identifier:
-        pass
-
-    # initialise and run the website parser
-    try:
-        if urllib.parse.urlparse(args.target).scheme:
-            try:
-                response = requests.get(args.target)
-                if "notion.so" in args.target or "notion.site" in args.target:
-                    log.info("Initialising parser with simple page url")
-                    config = {"page": args.target}
-                    Parser(config=config, args=vars(args))
-                else:
-                    log.critical(f"{args.target} is not a notion.so page")
-            except requests.ConnectionError as exception:
-                log.critical(f"Connection error")
-        else:
-            if Path(args.target).is_file():
-                with open(args.target, encoding="utf-8") as f:
-                    parsed_config = toml.loads(f.read())
-                    log.info(f"Initialising parser with configuration file")
-                    log.debug(parsed_config)
-                    Parser(config=parsed_config, args=vars(args))
-            else:
-                log.critical(f"Config file {args.target} does not exists")
-    except FileNotFoundError as e:
-        log.critical(f"FileNotFoundError: {e}")
-        sys.exit(0)
-
+import modules.main as main

 if __name__ == "__main__":
     try:
-        main()
+        args = main.get_args()
+        log = main.setup_logging(args)
+        parser = main.init_parser(args, log)
+        parser.run()
     except KeyboardInterrupt:
         log.critical("Interrupted by user")
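The refactored entry point reduces to argument parsing, logger setup, parser construction, and a run call. A sketch of the equivalent programmatic invocation (assumes the working directory is the loconotion/ package folder so `modules` is importable; faking `sys.argv` is needed because `get_args()` reads it; the URL is the example page used by the test suite below):

import sys

import modules.main as main

# simulate a CLI call
sys.argv = [
    "loconotion",
    "https://www.notion.so/Loconotion-Example-Page-03c403f4fdc94cc1b315b9469a8950ef",
    "--single-page",
]
args = main.get_args()
log = main.setup_logging(args)
parser = main.init_parser(args, log)
parser.run()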
loconotion/modules/__init__.py (new file, 0 lines)

loconotion/modules/main.py (new file, 152 lines)
@@ -0,0 +1,152 @@
import argparse
import copy
import logging
import sys
import urllib.parse
from pathlib import Path

from .notionparser import Parser

log = logging.getLogger("loconotion")

try:
    import colorama
    import requests
    import toml

except ModuleNotFoundError as error:
    log.critical(f"ModuleNotFoundError: {error}. Have you installed the requirements?")
    sys.exit()


def get_args():
    # set up argument parser and return parsed args
    argparser = argparse.ArgumentParser(
        description="Generate static websites from Notion.so pages"
    )
    argparser.add_argument(
        "target",
        help="The config file containing the site properties, or the url"
        " of the Notion.so page to generate the site from",
    )
    argparser.add_argument(
        "--chromedriver",
        help="Use a specific chromedriver executable instead of the auto-installing one",
    )
    argparser.add_argument(
        "--single-page",
        action="store_true",
        help="Only parse the first page, then stop",
    )
    argparser.add_argument(
        "--dark-theme",
        action="store_true",
        help="Use dark themed version of the target Notion.so page",
    )
    argparser.add_argument(
        "--timeout",
        type=int,
        default=5,
        help="Time in seconds to wait for the loading of lazy-loaded dynamic elements (default 5)."
        " If content from the page seems to be missing, try increasing this value",
    )
    argparser.add_argument(
        "--clean",
        action="store_true",
        help="Delete all previously cached files for the site before generating it",
    )
    argparser.add_argument(
        "--clean-css",
        action="store_true",
        help="Delete previously cached .css files for the site before generating it",
    )
    argparser.add_argument(
        "--clean-js",
        action="store_true",
        help="Delete previously cached .js files for the site before generating it",
    )
    argparser.add_argument(
        "--non-headless",
        action="store_true",
        help="Run chromedriver in non-headless mode",
    )
    argparser.add_argument(
        "-v", "--verbose", action="store_true", help="Increase output log verbosity"
    )
    return argparser.parse_args()
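Since `get_args()` reads `sys.argv` directly, it can be unit-tested by monkeypatching argv; a sketch using pytest's `monkeypatch` fixture (test name and values are illustrative):

from modules.main import get_args

def test_get_args_defaults(monkeypatch):
    monkeypatch.setattr("sys.argv", ["loconotion", "site.toml", "--single-page"])
    args = get_args()
    assert args.target == "site.toml"
    assert args.single_page is True
    assert args.timeout == 5  # the default declared above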
def setup_logging(args):
    # set up some pretty logs
    log = logging.getLogger("loconotion")
    log.setLevel(logging.INFO if not args.verbose else logging.DEBUG)
    log_screen_handler = logging.StreamHandler(stream=sys.stdout)
    log.addHandler(log_screen_handler)
    log.propagate = False
    try:
        LOG_COLORS = {
            logging.DEBUG: colorama.Fore.GREEN,
            logging.INFO: colorama.Fore.BLUE,
            logging.WARNING: colorama.Fore.YELLOW,
            logging.ERROR: colorama.Fore.RED,
            logging.CRITICAL: colorama.Back.RED,
        }

        class ColorFormatter(logging.Formatter):
            def format(self, record, *args, **kwargs):
                # if the corresponding logger has children, they may receive modified
                # record, so we want to keep it intact
                new_record = copy.copy(record)
                if new_record.levelno in LOG_COLORS:
                    new_record.levelname = "{color_begin}{level}{color_end}".format(
                        level=new_record.levelname,
                        color_begin=LOG_COLORS[new_record.levelno],
                        color_end=colorama.Style.RESET_ALL,
                    )
                return super(ColorFormatter, self).format(new_record, *args, **kwargs)

        log_screen_handler.setFormatter(
            ColorFormatter(
                fmt="%(asctime)s %(levelname)-8s %(message)s",
                datefmt="{color_begin}[%H:%M:%S]{color_end}".format(
                    color_begin=colorama.Style.DIM, color_end=colorama.Style.RESET_ALL
                ),
            )
        )
    except ModuleNotFoundError as identifier:
        pass

    return log
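`setup_logging` only inspects `args.verbose`, so it can be exercised standalone; a minimal sketch (the `Namespace` stands in for parsed CLI args):

import argparse

from modules.main import setup_logging

args = argparse.Namespace(verbose=True)
log = setup_logging(args)
log.info("shown at INFO and above")
log.debug("emitted only because verbose=True lowers the level to DEBUG")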
def init_parser(args, log):
    # initialise the website parser
    try:
        if urllib.parse.urlparse(args.target).scheme:
            try:
                requests.get(args.target)
            except requests.ConnectionError as exception:
                log.critical("Connection error")

            if "notion.so" in args.target or "notion.site" in args.target:
                log.info("Initialising parser with simple page url")
                config = {"page": args.target}
                parser = Parser(config=config, args=vars(args))
            else:
                log.critical(f"{args.target} is not a notion.so page")

        elif Path(args.target).is_file():
            with open(args.target, encoding="utf-8") as f:
                parsed_config = toml.loads(f.read())
                log.info("Initialising parser with configuration file")
                log.debug(parsed_config)
                parser = Parser(config=parsed_config, args=vars(args))

        else:
            log.critical(f"Config file {args.target} does not exist")

    except FileNotFoundError as e:
        log.critical(f"FileNotFoundError: {e}")
        sys.exit(0)

    return parser
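`init_parser` accepts either a URL or a path to a TOML file; a sketch of the smallest config its file branch will accept (`toml` is already a project dependency; the `page` key is the one `Parser` requires):

import toml

config_text = 'page = "https://www.notion.so/Loconotion-Example-Page-03c403f4fdc94cc1b315b9469a8950ef"\n'
parsed_config = toml.loads(config_text)
assert parsed_config["page"].startswith("https://www.notion.so/")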
loconotion/modules/notionparser.py
@@ -1,54 +1,53 @@
-import os
-import sys
-import shutil
-import time
-import uuid
-import logging
-import re
-import glob
-import mimetypes
-import urllib.parse
+import hashlib
+import logging
+import mimetypes
+import os
+import re
+import shutil
+import sys
+import time
+import urllib.parse
+import uuid
 from pathlib import Path

 log = logging.getLogger(f"loconotion.{__name__}")

 try:
     import chromedriver_autoinstaller
-    from selenium import webdriver
-    from selenium.webdriver.chrome.options import Options
-    from selenium.common.exceptions import TimeoutException, NoSuchElementException
-    from selenium.webdriver.support import expected_conditions as EC
-    from selenium.webdriver.common.by import By
-    from selenium.webdriver.common.action_chains import ActionChains
-    from selenium.webdriver.support.ui import WebDriverWait
-    from bs4 import BeautifulSoup
-    import requests
-    import cssutils
+    import cssutils
+    import requests
+    from bs4 import BeautifulSoup
+    from selenium import webdriver
+    from selenium.common.exceptions import TimeoutException
+    from selenium.webdriver.chrome.options import Options
+    from selenium.webdriver.support.ui import WebDriverWait

     cssutils.log.setLevel(logging.CRITICAL)  # removes warning logs from cssutils
 except ModuleNotFoundError as error:
     log.critical(f"ModuleNotFoundError: {error}. have your installed the requirements?")
     sys.exit()

-from conditions import toggle_block_has_opened, notion_page_loaded
+from .conditions import notion_page_loaded, toggle_block_has_opened


 class Parser:
     def __init__(self, config={}, args={}):
         self.config = config
         self.args = args
-        url = self.config.get("page", None)
-        if not url:
+        index_url = self.config.get("page", None)
+        if not index_url:
             log.critical(
                 "No initial page url specified. If passing a configuration file,"
-                " make sure it contains a 'page' key with the url of the notion.so"
+                " make sure it contains a 'page' key with the url of the notion.site"
                 " page to parse"
             )
             return

         # get the site name from the config, or make it up by cleaning the target page's slug
-        site_name = self.config.get("name", self.get_page_slug(url, extension=False))
+        site_name = self.config.get("name", self.get_page_slug(index_url, extension=False))
+
+        self.index_url = index_url

         # set the output folder based on the site name
         self.dist_folder = Path(config.get("output", Path("dist") / site_name))
@@ -80,9 +79,10 @@ class Parser:
         # create the output folder if necessary
         self.dist_folder.mkdir(parents=True, exist_ok=True)

-        # initialize chromedriver and start parsing
+        # initialize chromedriver
         self.driver = self.init_chromedriver()
-        self.run(url)
+
+        self.starting_url = index_url

     def get_page_config(self, token):
         # starts by grabbing the gobal site configuration table, if exists
@@ -183,8 +183,10 @@ class Parser:
             content_type = response.headers.get("content-type")
             if content_type:
                 file_extension = mimetypes.guess_extension(content_type)
-            elif '%3f' in file_extension.lower():
-                file_extension = re.split("%3f", file_extension, flags=re.IGNORECASE)[0]
+            elif "%3f" in file_extension.lower():
+                file_extension = re.split(
+                    "%3f", file_extension, flags=re.IGNORECASE
+                )[0]
             destination = destination.with_suffix(file_extension)

             Path(destination).parent.mkdir(parents=True, exist_ok=True)
@@ -230,8 +232,8 @@ class Parser:
         if not self.args.get("non_headless", False):
             chrome_options.add_argument("--headless")
             chrome_options.add_argument("window-size=1920,1080")
-        chrome_options.add_argument('--no-sandbox')
-        chrome_options.add_argument('--disable-dev-shm-usage')
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-dev-shm-usage")
         chrome_options.add_argument("--log-level=3")
         chrome_options.add_argument("--silent")
         chrome_options.add_argument("--disable-logging")
@@ -243,32 +245,76 @@ class Parser:
             options=chrome_options,
         )

-    def parse_page(self, url, processed_pages={}, index=None):
+    def parse_page(self, url: str):
+        """Parse page at url and write it to file, then recursively parse all subpages.
+
+        Args:
+            url (str): URL of the page to parse.
+
+        After the page at `url` has been parsed, calls itself recursively for every subpage
+        it has discovered.
+        """
         log.info(f"Parsing page '{url}'")
         log.debug(f"Using page config: {self.get_page_config(url)}")

         try:
-            self.load(url)
-            if not index:
-                # if this is the first page being parse, set it as the index.html
-                index = url
-            # if dark theme is enabled, set local storage item and re-load the page
-            if self.args.get("dark_theme", True):
-                log.debug(f"Dark theme is enabled")
-                self.driver.execute_script("window.localStorage.setItem('theme','{\"mode\":\"dark\"}');")
-                self.load(url)
-        except TimeoutException as ex:
+            self.load_correct_theme(url)
+        except TimeoutException:
             log.critical(
                 "Timeout waiting for page content to load, or no content found."
                 " Are you sure the page is set to public?"
             )
             return

+        self.scroll_to_the_bottom()
+
+        # open the toggle blocks in the page
+        self.open_toggle_blocks(self.args["timeout"])
+
+        # creates soup from the page to start parsing
+        soup = BeautifulSoup(self.driver.page_source, "html.parser")
+
+        self.clean_up(soup)
+        self.set_custom_meta_tags(url, soup)
+        self.process_images_and_emojis(soup)
+        self.process_stylesheets(soup)
+        self.add_toggle_custom_logic(soup)
+        self.process_table_views(soup)
+        self.embed_custom_fonts(url, soup)
+
+        # inject any custom elements to the page
+        custom_injects = self.get_page_config(url).get("inject", {})
+        self.inject_custom_tags("head", soup, custom_injects)
+        self.inject_custom_tags("body", soup, custom_injects)
+
+        self.inject_loconotion_script_and_css(soup)
+
+        hrefDomain = f'{url.split("notion.site")[0]}notion.site'
+        log.info(f"Got the domain as {hrefDomain}")
+
+        subpages = self.find_subpages(url, soup, hrefDomain)
+        self.export_parsed_page(url, soup)
+        self.parse_subpages(subpages)
+
+    def load_correct_theme(self, url):
+        self.load(url)
+
+        # if dark theme is enabled, set local storage item and re-load the page
+        if self.args.get("dark_theme", True):
+            log.debug("Dark theme is enabled")
+            self.driver.execute_script(
+                "window.localStorage.setItem('theme','{\"mode\":\"dark\"}');"
+            )
+            self.load(url)

         # light theme is on by default
         # enable dark mode based on https://fruitionsite.com/ dark mode hack
-        if self.config.get('theme') == 'dark':
-            self.driver.execute_script("__console.environment.ThemeStore.setState({ mode: 'dark' });")
+        if self.config.get("theme") == "dark":
+            self.driver.execute_script(
+                "__console.environment.ThemeStore.setState({ mode: 'dark' });"
+            )

+    def scroll_to_the_bottom(self):
         # scroll at the bottom of the notion-scroller element to load all elements
         # continue once there are no changes in height after a timeout
         # don't do this if the page has a calendar databse on it or it will load forever
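The dark-theme switch relies on Notion persisting its theme choice in `localStorage`; a standalone sketch of the same trick (assumes a local chromedriver on PATH; the URL is the test suite's example page):

from selenium import webdriver

driver = webdriver.Chrome()  # assumes chromedriver is on PATH
url = "https://www.notion.so/Loconotion-Example-Page-03c403f4fdc94cc1b315b9469a8950ef"
driver.get(url)
driver.execute_script("window.localStorage.setItem('theme','{\"mode\":\"dark\"}');")
driver.get(url)  # reload so the page picks up the persisted theme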
@@ -290,54 +336,53 @@ class Parser:
                 break
             last_height = new_height

-        # function to expand all the toggle block in the page to make their content visible
-        # so we can hook up our custom toggle logic afterwards
-        def open_toggle_blocks(timeout, exclude=[]):
-            opened_toggles = exclude
-            toggle_blocks = self.driver.find_elements_by_class_name("notion-toggle-block")
-            log.debug(f"Opening {len(toggle_blocks)} new toggle blocks in the page")
-            for toggle_block in toggle_blocks:
-                if not toggle_block in opened_toggles:
-                    toggle_button = toggle_block.find_element_by_css_selector(
-                        "div[role=button]"
-                    )
-                    # check if the toggle is already open by the direction of its arrow
-                    is_toggled = "(180deg)" in (
-                        toggle_button.find_element_by_tag_name("svg").get_attribute(
-                            "style"
-                        )
-                    )
-                    if not is_toggled:
-                        # click on it, then wait until all elements are displayed
-                        self.driver.execute_script("arguments[0].click();", toggle_button)
-                        try:
-                            WebDriverWait(self.driver, timeout).until(
-                                toggle_block_has_opened(toggle_block)
-                            )
-                        except TimeoutException as ex:
-                            log.warning(
-                                "Timeout waiting for toggle block to open."
-                                " Likely it's already open, but doesn't hurt to check."
-                            )
-                        except Exception as exception:
-                            log.error(f"Error trying to open a toggle block: {exception}")
-                        opened_toggles.append(toggle_block)
+    def open_toggle_blocks(self, timeout: int, exclude=[]):
+        """Expand all the toggle block in the page to make their content visible
+
+        Args:
+            timeout (int): timeout in seconds
+            exclude (list[Webelement], optional): toggles to exclude. Defaults to [].
+
+        Opening toggles is needed for hooking up our custom toggle logic afterwards.
+        """
+        opened_toggles = exclude
+        toggle_blocks = self.driver.find_elements_by_class_name("notion-toggle-block")
+        log.debug(f"Opening {len(toggle_blocks)} new toggle blocks in the page")
+        for toggle_block in toggle_blocks:
+            if toggle_block not in opened_toggles:
+                toggle_button = toggle_block.find_element_by_css_selector(
+                    "div[role=button]"
+                )
+                # check if the toggle is already open by the direction of its arrow
+                is_toggled = "(180deg)" in (
+                    toggle_button.find_element_by_tag_name("svg").get_attribute("style")
+                )
+                if not is_toggled:
+                    # click on it, then wait until all elements are displayed
+                    self.driver.execute_script("arguments[0].click();", toggle_button)
+                    try:
+                        WebDriverWait(self.driver, timeout).until(
+                            toggle_block_has_opened(toggle_block)
+                        )
+                    except TimeoutException as ex:
+                        log.warning(
+                            "Timeout waiting for toggle block to open."
+                            " Likely it's already open, but doesn't hurt to check."
+                        )
+                    except Exception as exception:
+                        log.error(f"Error trying to open a toggle block: {exception}")
+                    opened_toggles.append(toggle_block)

-            # after all toggles have been opened, check the page again to see if
-            # any toggle block had nested toggle blocks inside them
-            new_toggle_blocks = self.driver.find_elements_by_class_name(
-                "notion-toggle-block"
-            )
-            if len(new_toggle_blocks) > len(toggle_blocks):
-                # if so, run the function again
-                open_toggle_blocks(timeout, opened_toggles)
-
-        # open the toggle blocks in the page
-        open_toggle_blocks(self.args["timeout"])
-
-        # creates soup from the page to start parsing
-        soup = BeautifulSoup(self.driver.page_source, "html.parser")
+        # after all toggles have been opened, check the page again to see if
+        # any toggle block had nested toggle blocks inside them
+        new_toggle_blocks = self.driver.find_elements_by_class_name(
+            "notion-toggle-block"
+        )
+        if len(new_toggle_blocks) > len(toggle_blocks):
+            # if so, run the function again
+            self.open_toggle_blocks(timeout, opened_toggles)

+    def clean_up(self, soup):
         # remove scripts and other tags we don't want / need
         for unwanted in soup.findAll("script"):
             unwanted.decompose()
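The nested-toggle recursion amounts to expanding until a pass discovers no new blocks; an abstract sketch of that fixed-point loop (names are illustrative, not from the diff):

def expand_all_toggles(find_blocks, open_block):
    # keep scanning until a pass discovers no new toggle blocks
    opened = []
    while True:
        blocks = find_blocks()
        for block in blocks:
            if block not in opened:
                open_block(block)
                opened.append(block)
        if len(find_blocks()) <= len(blocks):
            break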
@@ -351,7 +396,9 @@ class Parser:
             vendors_css.decompose()

         # collection selectors (List, Gallery, etc.) don't work, so remove them
-        for collection_selector in soup.findAll("div", {"class": "notion-collection-view-select"}):
+        for collection_selector in soup.findAll(
+            "div", {"class": "notion-collection-view-select"}
+        ):
             collection_selector.decompose()

         # clean up the default notion meta tags
@@ -380,6 +427,7 @@ class Parser:
         if unwanted_og_tag:
             unwanted_og_tag.decompose()

+    def set_custom_meta_tags(self, url, soup):
         # set custom meta tags
         custom_meta_tags = self.get_page_config(url).get("meta", [])
         for custom_meta_tag in custom_meta_tags:
@@ -389,15 +437,16 @@ class Parser:
                 log.debug(f"Adding meta tag {str(tag)}")
                 soup.head.append(tag)

+    def process_images_and_emojis(self, soup):
         # process images & emojis
         cache_images = True
         for img in soup.findAll("img"):
             if img.has_attr("src"):
-                if cache_images and not "data:image" in img["src"]:
+                if cache_images and "data:image" not in img["src"]:
                     img_src = img["src"]
                     # if the path starts with /, it's one of notion's predefined images
                     if img["src"].startswith("/"):
-                        img_src = "https://www.notion.so" + img["src"]
+                        img_src = f'https://www.notion.so{img["src"]}'
                     # notion's own default images urls are in a weird format, need to sanitize them
                     # img_src = 'https://www.notion.so' + img['src'].split("notion.so")[-1].replace("notion.so", "").split("?")[0]
                     # if (not '.amazonaws' in img_src):
@@ -405,35 +454,40 @@ class Parser:

                     cached_image = self.cache_file(img_src)
                     img["src"] = cached_image
-                else:
-                    if img["src"].startswith("/"):
-                        img["src"] = "https://www.notion.so" + img["src"]
+                elif img["src"].startswith("/"):
+                    img["src"] = f'https://www.notion.so{img["src"]}'

             # on emoji images, cache their sprite sheet and re-set their background url
             if img.has_attr("class") and "notion-emoji" in img["class"]:
                 style = cssutils.parseStyle(img["style"])
                 spritesheet = style["background"]
                 spritesheet_url = spritesheet[
-                    spritesheet.find("(") + 1: spritesheet.find(")")
-                ]
+                    spritesheet.find("(") + 1 : spritesheet.find(")")
+                ]
                 cached_spritesheet_url = self.cache_file(
-                    "https://www.notion.so" + spritesheet_url
+                    f"https://www.notion.so{spritesheet_url}"
                 )

                 style["background"] = spritesheet.replace(
                     spritesheet_url, str(cached_spritesheet_url)
                 )
                 img["style"] = style.cssText

+    def process_stylesheets(self, soup):
         # process stylesheets
         for link in soup.findAll("link", rel="stylesheet"):
             if link.has_attr("href") and link["href"].startswith("/"):
                 # we don't need the vendors stylesheet
                 if "vendors~" in link["href"]:
                     continue
-                cached_css_file = self.cache_file("https://www.notion.so" + link["href"])
+                cached_css_file = self.cache_file(
+                    f'https://www.notion.so{link["href"]}'
+                )
                 # files in the css file might be reference with a relative path,
                 # so store the path of the current css file
-                parent_css_path = os.path.split(urllib.parse.urlparse(link["href"]).path)[0]
+                parent_css_path = os.path.split(
+                    urllib.parse.urlparse(link["href"]).path
+                )[0]
                 # open the locally saved file
                 with open(self.dist_folder / cached_css_file, "rb+") as f:
                     stylesheet = cssutils.parseString(f.read())
@@ -446,17 +500,28 @@ class Parser:
                             rule.style["src"].split("url(")[-1].split(")")[0]
                         )
                         # assemble the url given the current css path
-                        font_url = "/".join(p.strip("/") for p in ["https://www.notion.so", parent_css_path, font_file] if p.strip("/"))
+                        font_url = "/".join(
+                            p.strip("/")
+                            for p in [
+                                "https://www.notion.so",
+                                parent_css_path,
+                                font_file,
+                            ]
+                            if p.strip("/")
+                        )
                         # don't hash the font files filenames, rather get filename only
-                        cached_font_file = self.cache_file(font_url, Path(font_file).name)
+                        cached_font_file = self.cache_file(
+                            font_url, Path(font_file).name
+                        )
                         rule.style["src"] = f"url({cached_font_file})"
                 # commit stylesheet edits to file
                 f.seek(0)
                 f.truncate()
                 f.write(stylesheet.cssText)

                 link["href"] = str(cached_css_file)

+    def add_toggle_custom_logic(self, soup):
         # add our custom logic to all toggle blocks
         for toggle_block in soup.findAll("div", {"class": "notion-toggle-block"}):
             toggle_id = uuid.uuid4()
@@ -476,21 +541,46 @@ class Parser:
                     "loconotion-toggle-id"
                 ] = toggle_id

+    def process_table_views(self, soup):
         # if there are any table views in the page, add links to the title rows
         # the link to the row item is equal to its data-block-id without dashes
         for table_view in soup.findAll("div", {"class": "notion-table-view"}):
             for table_row in table_view.findAll(
-                 "div", {"class": "notion-collection-item"}
+                "div", {"class": "notion-collection-item"}
             ):
                 table_row_block_id = table_row["data-block-id"]
                 table_row_href = "/" + table_row_block_id.replace("-", "")
                 row_target_span = table_row.find("span")
-                row_target_span["style"] = row_target_span["style"].replace("pointer-events: none;","")
+                row_target_span["style"] = row_target_span["style"].replace(
+                    "pointer-events: none;", ""
+                )
                 row_link_wrapper = soup.new_tag(
-                    "a", attrs={"href": table_row_href, "style": "cursor: pointer; color: inherit; text-decoration: none; fill: inherit;"}
+                    "a",
+                    attrs={
+                        "href": table_row_href,
+                        "style": "cursor: pointer; color: inherit; text-decoration: none; fill: inherit;",
+                    },
                 )
                 row_target_span.wrap(row_link_wrapper)

+    def embed_custom_fonts(self, url, soup):
+        if not (custom_fonts := self.get_page_config(url).get("fonts", {})):
+            return
+
+        # append a stylesheet importing the google font for each unique font
+        unique_custom_fonts = set(custom_fonts.values())
+        for font in unique_custom_fonts:
+            if font:
+                google_fonts_embed_name = font.replace(" ", "+")
+                font_href = f"https://fonts.googleapis.com/css2?family={google_fonts_embed_name}:wght@500;600;700&display=swap"
+                custom_font_stylesheet = soup.new_tag(
+                    "link", rel="stylesheet", href=font_href
+                )
+                soup.head.append(custom_font_stylesheet)
+
+        # go through each custom font, and add a css rule overriding the font-family
+        # to the font override stylesheet targetting the appropriate selector
+        font_override_stylesheet = soup.new_tag("style", type="text/css")
         # embed custom google font(s)
         fonts_selectors = {
             "site": "div:not(.notion-code-block)",
@@ -502,66 +592,53 @@ class Parser:
             "body": ".notion-scroller",
             "code": ".notion-code-block *",
         }
-        custom_fonts = self.get_page_config(url).get("fonts", {})
-        if custom_fonts:
-            # append a stylesheet importing the google font for each unique font
-            unique_custom_fonts = set(custom_fonts.values())
-            for font in unique_custom_fonts:
-                if font:
-                    google_fonts_embed_name = font.replace(" ", "+")
-                    font_href = f"https://fonts.googleapis.com/css2?family={google_fonts_embed_name}:wght@500;600;700&display=swap"
-                    custom_font_stylesheet = soup.new_tag(
-                        "link", rel="stylesheet", href=font_href
-                    )
-                    soup.head.append(custom_font_stylesheet)
-
-            # go through each custom font, and add a css rule overriding the font-family
-            # to the font override stylesheet targetting the appropriate selector
-            font_override_stylesheet = soup.new_tag("style", type="text/css")
-            for target, custom_font in custom_fonts.items():
-                if custom_font and not target == "site":
-                    log.debug(f"Setting {target} font-family to {custom_font}")
-                    font_override_stylesheet.append(
-                        fonts_selectors[target]
-                        + " {font-family:"
-                        + custom_font
-                        + " !important} "
-                    )
-            site_font = custom_fonts.get("site", None)
-            # process global site font last to more granular settings can override it
-            if site_font:
-                log.debug(f"Setting global site font-family to {site_font}"),
-                font_override_stylesheet.append(
-                    fonts_selectors["site"] + " {font-family:" + site_font + "} "
-                )
-            # finally append the font overrides stylesheets to the page
-            soup.head.append(font_override_stylesheet)
-
-        # inject any custom elements to the page
-        custom_injects = self.get_page_config(url).get("inject", {})
-
-        def injects_custom_tags(section):
-            section_custom_injects = custom_injects.get(section, {})
-            for tag, elements in section_custom_injects.items():
-                for element in elements:
-                    injected_tag = soup.new_tag(tag)
-                    for attr, value in element.items():
-                        injected_tag[attr] = value
-                        # if the value refers to a file, copy it to the dist folder
-                        if attr.lower() == "href" or attr.lower() == "src":
-                            log.debug(f"Copying injected file '{value}'")
-                            cached_custom_file = self.cache_file(
-                                (Path.cwd() / value.strip("/"))
-                            )
-                            # destination = (self.dist_folder / source.name)
-                            # shutil.copyfile(source, destination)
-                            injected_tag[attr] = str(cached_custom_file)  # source.name
-                    log.debug(f"Injecting <{section}> tag: {str(injected_tag)}")
-                    soup.find(section).append(injected_tag)
-
-        injects_custom_tags("head")
-        injects_custom_tags("body")
+        for target, custom_font in custom_fonts.items():
+            if custom_font and target != "site":
+                log.debug(f"Setting {target} font-family to {custom_font}")
+                font_override_stylesheet.append(
+                    fonts_selectors[target]
+                    + " {font-family:"
+                    + custom_font
+                    + " !important} "
+                )
+
+        site_font = custom_fonts.get("site", None)
+        if site_font:
+            log.debug(f"Setting global site font-family to {site_font}"),
+            font_override_stylesheet.append(
+                fonts_selectors["site"] + " {font-family:" + site_font + "} "
+            )
+
+        # finally append the font overrides stylesheets to the page
+        soup.head.append(font_override_stylesheet)
+
+    def inject_custom_tags(self, section: str, soup, custom_injects: dict):
+        """Inject custom tags to the given section.
+
+        Args:
+            section (str): Section / tag name to insert into.
+            soup (BeautifulSoup): a BeautifulSoup element holding the whole page.
+            custom_injects (dict): description of custom tags to inject.
+        """
+        section_custom_injects = custom_injects.get(section, {})
+        for tag, elements in section_custom_injects.items():
+            for element in elements:
+                injected_tag = soup.new_tag(tag)
+                for attr, value in element.items():
+                    injected_tag[attr] = value
+                    # if the value refers to a file, copy it to the dist folder
+                    if attr.lower() in ["href", "src"]:
+                        log.debug(f"Copying injected file '{value}'")
+                        cached_custom_file = self.cache_file(
+                            (Path.cwd() / value.strip("/"))
+                        )
+                        # destination = (self.dist_folder / source.name)
+                        # shutil.copyfile(source, destination)
+                        injected_tag[attr] = str(cached_custom_file)  # source.name
+                log.debug(f"Injecting <{section}> tag: {injected_tag}")
+                soup.find(section).append(injected_tag)

+    def inject_loconotion_script_and_css(self, soup):
         # inject loconotion's custom stylesheet and script
         loconotion_custom_css = self.cache_file(Path("bundles/loconotion.css"))
         custom_css = soup.new_tag(
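Both `embed_custom_fonts` and `inject_custom_tags` are driven by tables from the site's page config; a sketch of the dictionary shapes they consume (values are illustrative):

# keys of custom_fonts must match the fonts_selectors table above;
# "site" is applied last as the global fallback
custom_fonts = {"site": "Roboto", "code": "Fira Code"}

# custom_injects maps section -> tag name -> list of attribute dicts;
# href/src values are copied into the dist folder by cache_file()
custom_injects = {
    "head": {"link": [{"rel": "icon", "href": "favicon.ico", "type": "image/x-icon"}]},
    "body": {"script": [{"type": "text/javascript", "src": "custom.js"}]},
}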
@@ -574,29 +651,31 @@ class Parser:
         )
         soup.body.insert(-1, custom_script)

-        hrefDomain = url.split('notion.site')[0] + 'notion.site'
-        log.info(f"Got the domain as {hrefDomain}")
-
+    def find_subpages(self, url, soup, hrefDomain):
         # find sub-pages and clean slugs / links
-        sub_pages = []
+        subpages = []
         parse_links = not self.get_page_config(url).get("no-links", False)
-        for a in soup.find_all('a', href=True):
+        for a in soup.find_all("a", href=True):
             sub_page_href = a["href"]
             if sub_page_href.startswith("/"):
-                sub_page_href = hrefDomain + '/'+ a["href"].split('/')[len(a["href"].split('/'))-1]
+                sub_page_href = (
+                    f'{hrefDomain}/{a["href"].split("/")[len(a["href"].split("/"))-1]}'
+                )
                 log.info(f"Got this as href {sub_page_href}")
             if sub_page_href.startswith(hrefDomain):
-                if parse_links or not len(a.find_parents("div", class_="notion-scroller")):
+                if parse_links or not len(
+                    a.find_parents("div", class_="notion-scroller")
+                ):
                     # if the link is an anchor link,
                     # check if the page hasn't already been parsed
                     if "#" in sub_page_href:
                         sub_page_href_tokens = sub_page_href.split("#")
                         sub_page_href = sub_page_href_tokens[0]
-                        a["href"] = "#" + sub_page_href_tokens[-1]
+                        a["href"] = f"#{sub_page_href_tokens[-1]}"
                         a["class"] = a.get("class", []) + ["loconotion-anchor-link"]
                         if (
-                            sub_page_href in processed_pages.keys()
-                            or sub_page_href in sub_pages
+                            sub_page_href in self.processed_pages.keys()
+                            or sub_page_href in subpages
                         ):
                             log.debug(
                                 f"Original page for anchor link {sub_page_href}"
@@ -606,10 +685,10 @@ class Parser:
                     else:
                         a["href"] = (
                             self.get_page_slug(sub_page_href)
-                            if sub_page_href != index
+                            if sub_page_href != self.index_url
                             else "index.html"
                         )
-                        sub_pages.append(sub_page_href)
+                        subpages.append(sub_page_href)
                         log.debug(f"Found link to page {a['href']}")
             else:
                 # if the page is set not to follow any links, strip the href
@@ -619,17 +698,18 @@ class Parser:
                 del a["href"]
                 a.name = "span"
                 # remove pointer cursor styling on the link and all children
-                for child in ([a] + a.find_all()):
-                    if (child.has_attr("style")):
-                        style = cssutils.parseStyle(child['style'])
-                        style['cursor'] = "default"
-                        child['style'] = style.cssText
+                for child in [a] + a.find_all():
+                    if child.has_attr("style"):
+                        style = cssutils.parseStyle(child["style"])
+                        style["cursor"] = "default"
+                        child["style"] = style.cssText
+        return subpages

+    def export_parsed_page(self, url, soup):
         # exports the parsed page
         html_str = str(soup)
-        html_file = self.get_page_slug(url) if url != index else "index.html"
-        if html_file in processed_pages.values():
+        html_file = self.get_page_slug(url) if url != self.index_url else "index.html"
+        if html_file in self.processed_pages.values():
             log.error(
                 f"Found duplicate pages with slug '{html_file}' - previous one will be"
                 " overwritten. Make sure that your notion pages names or custom slugs"
@@ -638,35 +718,31 @@ class Parser:
         log.info(f"Exporting page '{url}' as '{html_file}'")
         with open(self.dist_folder / html_file, "wb") as f:
             f.write(html_str.encode("utf-8").strip())
-        processed_pages[url] = html_file
+        self.processed_pages[url] = html_file

+    def parse_subpages(self, subpages):
         # parse sub-pages
-        if sub_pages and not self.args.get("single_page", False):
-            if processed_pages:
-                log.debug(f"Pages processed so far: {len(processed_pages)}")
-            for sub_page in sub_pages:
-                if not sub_page in processed_pages.keys():
-                    self.parse_page(
-                        sub_page, processed_pages=processed_pages, index=index
-                    )
-
-        # we're all done!
-        return processed_pages
+        if subpages and not self.args.get("single_page", False):
+            if self.processed_pages:
+                log.debug(f"Pages processed so far: {len(self.processed_pages)}")
+            for sub_page in subpages:
+                if sub_page not in self.processed_pages.keys():
+                    self.parse_page(sub_page)

     def load(self, url):
         self.driver.get(url)
         WebDriverWait(self.driver, 60).until(notion_page_loaded())

-    def run(self, url):
+    def run(self):
         start_time = time.time()
-        tot_processed_pages = self.parse_page(url)
+        self.processed_pages = {}
+        self.parse_page(self.starting_url)
         elapsed_time = time.time() - start_time
         formatted_time = "{:02d}:{:02d}:{:02d}".format(
             int(elapsed_time // 3600),
             int(elapsed_time % 3600 // 60),
             int(elapsed_time % 60),
         )
         log.info(
-            f"Finished!\n\nProcessed {len(tot_processed_pages)} pages in {formatted_time}"
+            f"Finished!\n\nProcessed {len(self.processed_pages)} pages in {formatted_time}"
         )
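The elapsed-time formatting used in `run()` can be sanity-checked in isolation:

elapsed_time = 3725.0  # 1 hour, 2 minutes, 5 seconds
formatted_time = "{:02d}:{:02d}:{:02d}".format(
    int(elapsed_time // 3600),
    int(elapsed_time % 3600 // 60),
    int(elapsed_time % 60),
)
assert formatted_time == "01:02:05"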
loconotion/tests/__init__.py (new file, 0 lines)

loconotion/tests/test_parser.py (new file, 12 lines)
@@ -0,0 +1,12 @@
import pytest
from modules.notionparser import Parser


def test_parse_sample_page():
    config = {"page": "https://www.notion.so/Loconotion-Example-Page-03c403f4fdc94cc1b315b9469a8950ef"}
    args = {"timeout": 10, "single_page": True}
    parser = Parser(config, args)
    parser.processed_pages = {}

    parser.parse_page(parser.starting_url)

    assert parser.starting_url in parser.processed_pages
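The suite can be run from the loconotion/ package directory (so that `modules` resolves), either with `python -m pytest tests` or programmatically:

import pytest

# equivalent to running `pytest tests` from the loconotion/ directory
pytest.main(["tests"])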