mirror of
https://github.com/leoncvlt/loconotion.git
synced 2024-08-30 18:12:12 +00:00
Split parse_page
into several methods and some minor refactorings
Makes `parse_page` more readable
This commit is contained in:
parent
727121201d
commit
9b8ca4d771
@ -256,28 +256,63 @@ class Parser:
|
|||||||
log.info(f"Parsing page '{url}'")
|
log.info(f"Parsing page '{url}'")
|
||||||
log.debug(f"Using page config: {self.get_page_config(url)}")
|
log.debug(f"Using page config: {self.get_page_config(url)}")
|
||||||
|
|
||||||
|
if not index: # if this is the first page being parsed
|
||||||
|
index = url # set it as the index.html
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.load(url)
|
self.load_correct_theme(url)
|
||||||
if not index:
|
except TimeoutException:
|
||||||
# if this is the first page being parsed, set it as the index.html
|
|
||||||
index = url
|
|
||||||
# if dark theme is enabled, set local storage item and re-load the page
|
|
||||||
if self.args.get("dark_theme", True):
|
|
||||||
log.debug(f"Dark theme is enabled")
|
|
||||||
self.driver.execute_script("window.localStorage.setItem('theme','{\"mode\":\"dark\"}');")
|
|
||||||
self.load(url)
|
|
||||||
except TimeoutException as ex:
|
|
||||||
log.critical(
|
log.critical(
|
||||||
"Timeout waiting for page content to load, or no content found."
|
"Timeout waiting for page content to load, or no content found."
|
||||||
" Are you sure the page is set to public?"
|
" Are you sure the page is set to public?"
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
self.scroll_to_the_bottom()
|
||||||
|
|
||||||
|
# open the toggle blocks in the page
|
||||||
|
self.open_toggle_blocks(self.args["timeout"])
|
||||||
|
|
||||||
|
# creates soup from the page to start parsing
|
||||||
|
soup = BeautifulSoup(self.driver.page_source, "html.parser")
|
||||||
|
|
||||||
|
self.clean_up(soup)
|
||||||
|
self.set_custom_meta_tags(url, soup)
|
||||||
|
self.process_images_and_emojis(soup)
|
||||||
|
self.process_stylesheets(soup)
|
||||||
|
self.add_toggle_custom_logic(soup)
|
||||||
|
self.process_table_views(soup)
|
||||||
|
self.embed_custom_fonts(url, soup)
|
||||||
|
|
||||||
|
# inject any custom elements to the page
|
||||||
|
custom_injects = self.get_page_config(url).get("inject", {})
|
||||||
|
self.inject_custom_tags("head", soup, custom_injects)
|
||||||
|
self.inject_custom_tags("body", soup, custom_injects)
|
||||||
|
|
||||||
|
self.inject_loconotion_script_and_css(soup)
|
||||||
|
|
||||||
|
hrefDomain = f'{url.split("notion.site")[0]}notion.site'
|
||||||
|
log.info(f"Got the domain as {hrefDomain}")
|
||||||
|
|
||||||
|
subpages = self.find_subpages(url, index, soup, hrefDomain)
|
||||||
|
self.export_parsed_page(url, index, soup)
|
||||||
|
self.parse_subpages(index, subpages)
|
||||||
|
|
||||||
|
def load_correct_theme(self, url):
|
||||||
|
self.load(url)
|
||||||
|
|
||||||
|
# if dark theme is enabled, set local storage item and re-load the page
|
||||||
|
if self.args.get("dark_theme", True):
|
||||||
|
log.debug("Dark theme is enabled")
|
||||||
|
self.driver.execute_script("window.localStorage.setItem('theme','{\"mode\":\"dark\"}');")
|
||||||
|
self.load(url)
|
||||||
|
|
||||||
# light theme is on by default
|
# light theme is on by default
|
||||||
# enable dark mode based on https://fruitionsite.com/ dark mode hack
|
# enable dark mode based on https://fruitionsite.com/ dark mode hack
|
||||||
if self.config.get('theme') == 'dark':
|
if self.config.get('theme') == 'dark':
|
||||||
self.driver.execute_script("__console.environment.ThemeStore.setState({ mode: 'dark' });")
|
self.driver.execute_script("__console.environment.ThemeStore.setState({ mode: 'dark' });")
|
||||||
|
\
|
||||||
|
def scroll_to_the_bottom(self):
|
||||||
# scroll at the bottom of the notion-scroller element to load all elements
|
# scroll at the bottom of the notion-scroller element to load all elements
|
||||||
# continue once there are no changes in height after a timeout
|
# continue once there are no changes in height after a timeout
|
||||||
# don't do this if the page has a calendar databse on it or it will load forever
|
# don't do this if the page has a calendar databse on it or it will load forever
|
||||||
@ -299,302 +334,6 @@ class Parser:
|
|||||||
break
|
break
|
||||||
last_height = new_height
|
last_height = new_height
|
||||||
|
|
||||||
# open the toggle blocks in the page
|
|
||||||
self.open_toggle_blocks(self.args["timeout"])
|
|
||||||
|
|
||||||
# creates soup from the page to start parsing
|
|
||||||
soup = BeautifulSoup(self.driver.page_source, "html.parser")
|
|
||||||
|
|
||||||
# remove scripts and other tags we don't want / need
|
|
||||||
for unwanted in soup.findAll("script"):
|
|
||||||
unwanted.decompose()
|
|
||||||
for intercom_frame in soup.findAll("iframe", {"id": "intercom-frame"}):
|
|
||||||
intercom_frame.decompose()
|
|
||||||
for intercom_div in soup.findAll("div", {"class": "intercom-lightweight-app"}):
|
|
||||||
intercom_div.decompose()
|
|
||||||
for overlay_div in soup.findAll("div", {"class": "notion-overlay-container"}):
|
|
||||||
overlay_div.decompose()
|
|
||||||
for vendors_css in soup.find_all("link", href=lambda x: x and "vendors~" in x):
|
|
||||||
vendors_css.decompose()
|
|
||||||
|
|
||||||
# collection selectors (List, Gallery, etc.) don't work, so remove them
|
|
||||||
for collection_selector in soup.findAll("div", {"class": "notion-collection-view-select"}):
|
|
||||||
collection_selector.decompose()
|
|
||||||
|
|
||||||
# clean up the default notion meta tags
|
|
||||||
for tag in [
|
|
||||||
"description",
|
|
||||||
"twitter:card",
|
|
||||||
"twitter:site",
|
|
||||||
"twitter:title",
|
|
||||||
"twitter:description",
|
|
||||||
"twitter:image",
|
|
||||||
"twitter:url",
|
|
||||||
"apple-itunes-app",
|
|
||||||
]:
|
|
||||||
unwanted_tag = soup.find("meta", attrs={"name": tag})
|
|
||||||
if unwanted_tag:
|
|
||||||
unwanted_tag.decompose()
|
|
||||||
for tag in [
|
|
||||||
"og:site_name",
|
|
||||||
"og:type",
|
|
||||||
"og:url",
|
|
||||||
"og:title",
|
|
||||||
"og:description",
|
|
||||||
"og:image",
|
|
||||||
]:
|
|
||||||
unwanted_og_tag = soup.find("meta", attrs={"property": tag})
|
|
||||||
if unwanted_og_tag:
|
|
||||||
unwanted_og_tag.decompose()
|
|
||||||
|
|
||||||
# set custom meta tags
|
|
||||||
custom_meta_tags = self.get_page_config(url).get("meta", [])
|
|
||||||
for custom_meta_tag in custom_meta_tags:
|
|
||||||
tag = soup.new_tag("meta")
|
|
||||||
for attr, value in custom_meta_tag.items():
|
|
||||||
tag.attrs[attr] = value
|
|
||||||
log.debug(f"Adding meta tag {str(tag)}")
|
|
||||||
soup.head.append(tag)
|
|
||||||
|
|
||||||
# process images & emojis
|
|
||||||
cache_images = True
|
|
||||||
for img in soup.findAll("img"):
|
|
||||||
if img.has_attr("src"):
|
|
||||||
if cache_images and not "data:image" in img["src"]:
|
|
||||||
img_src = img["src"]
|
|
||||||
# if the path starts with /, it's one of notion's predefined images
|
|
||||||
if img["src"].startswith("/"):
|
|
||||||
img_src = "https://www.notion.so" + img["src"]
|
|
||||||
# notion's own default images urls are in a weird format, need to sanitize them
|
|
||||||
# img_src = 'https://www.notion.so' + img['src'].split("notion.so")[-1].replace("notion.so", "").split("?")[0]
|
|
||||||
# if (not '.amazonaws' in img_src):
|
|
||||||
# img_src = urllib.parse.unquote(img_src)
|
|
||||||
|
|
||||||
cached_image = self.cache_file(img_src)
|
|
||||||
img["src"] = cached_image
|
|
||||||
else:
|
|
||||||
if img["src"].startswith("/"):
|
|
||||||
img["src"] = "https://www.notion.so" + img["src"]
|
|
||||||
|
|
||||||
# on emoji images, cache their sprite sheet and re-set their background url
|
|
||||||
if img.has_attr("class") and "notion-emoji" in img["class"]:
|
|
||||||
style = cssutils.parseStyle(img["style"])
|
|
||||||
spritesheet = style["background"]
|
|
||||||
spritesheet_url = spritesheet[
|
|
||||||
spritesheet.find("(") + 1: spritesheet.find(")")
|
|
||||||
]
|
|
||||||
cached_spritesheet_url = self.cache_file(
|
|
||||||
"https://www.notion.so" + spritesheet_url
|
|
||||||
)
|
|
||||||
style["background"] = spritesheet.replace(
|
|
||||||
spritesheet_url, str(cached_spritesheet_url)
|
|
||||||
)
|
|
||||||
img["style"] = style.cssText
|
|
||||||
|
|
||||||
# process stylesheets
|
|
||||||
for link in soup.findAll("link", rel="stylesheet"):
|
|
||||||
if link.has_attr("href") and link["href"].startswith("/"):
|
|
||||||
# we don't need the vendors stylesheet
|
|
||||||
if "vendors~" in link["href"]:
|
|
||||||
continue
|
|
||||||
cached_css_file = self.cache_file("https://www.notion.so" + link["href"])
|
|
||||||
# files in the css file might be reference with a relative path,
|
|
||||||
# so store the path of the current css file
|
|
||||||
parent_css_path = os.path.split(urllib.parse.urlparse(link["href"]).path)[0]
|
|
||||||
# open the locally saved file
|
|
||||||
with open(self.dist_folder / cached_css_file, "rb+") as f:
|
|
||||||
stylesheet = cssutils.parseString(f.read())
|
|
||||||
# open the stylesheet and check for any font-face rule,
|
|
||||||
for rule in stylesheet.cssRules:
|
|
||||||
if rule.type == cssutils.css.CSSRule.FONT_FACE_RULE:
|
|
||||||
# if any are found, download the font file
|
|
||||||
# TODO: maths fonts have fallback font sources
|
|
||||||
font_file = (
|
|
||||||
rule.style["src"].split("url(")[-1].split(")")[0]
|
|
||||||
)
|
|
||||||
# assemble the url given the current css path
|
|
||||||
font_url = "/".join(p.strip("/") for p in ["https://www.notion.so", parent_css_path, font_file] if p.strip("/"))
|
|
||||||
# don't hash the font files filenames, rather get filename only
|
|
||||||
cached_font_file = self.cache_file(font_url, Path(font_file).name)
|
|
||||||
rule.style["src"] = f"url({cached_font_file})"
|
|
||||||
# commit stylesheet edits to file
|
|
||||||
f.seek(0)
|
|
||||||
f.truncate()
|
|
||||||
f.write(stylesheet.cssText)
|
|
||||||
|
|
||||||
link["href"] = str(cached_css_file)
|
|
||||||
|
|
||||||
# add our custom logic to all toggle blocks
|
|
||||||
for toggle_block in soup.findAll("div", {"class": "notion-toggle-block"}):
|
|
||||||
toggle_id = uuid.uuid4()
|
|
||||||
toggle_button = toggle_block.select_one("div[role=button]")
|
|
||||||
toggle_content = toggle_block.find("div", {"class": None, "style": ""})
|
|
||||||
if toggle_button and toggle_content:
|
|
||||||
# add a custom class to the toggle button and content,
|
|
||||||
# plus a custom attribute sharing a unique uiid so
|
|
||||||
# we can hook them up with some custom js logic later
|
|
||||||
toggle_button["class"] = toggle_block.get("class", []) + [
|
|
||||||
"loconotion-toggle-button"
|
|
||||||
]
|
|
||||||
toggle_content["class"] = toggle_content.get("class", []) + [
|
|
||||||
"loconotion-toggle-content"
|
|
||||||
]
|
|
||||||
toggle_content.attrs["loconotion-toggle-id"] = toggle_button.attrs[
|
|
||||||
"loconotion-toggle-id"
|
|
||||||
] = toggle_id
|
|
||||||
|
|
||||||
# if there are any table views in the page, add links to the title rows
|
|
||||||
# the link to the row item is equal to its data-block-id without dashes
|
|
||||||
for table_view in soup.findAll("div", {"class": "notion-table-view"}):
|
|
||||||
for table_row in table_view.findAll(
|
|
||||||
"div", {"class": "notion-collection-item"}
|
|
||||||
):
|
|
||||||
table_row_block_id = table_row["data-block-id"]
|
|
||||||
table_row_href = "/" + table_row_block_id.replace("-", "")
|
|
||||||
row_target_span = table_row.find("span")
|
|
||||||
row_target_span["style"] = row_target_span["style"].replace("pointer-events: none;","")
|
|
||||||
row_link_wrapper = soup.new_tag(
|
|
||||||
"a", attrs={"href": table_row_href, "style": "cursor: pointer; color: inherit; text-decoration: none; fill: inherit;"}
|
|
||||||
)
|
|
||||||
row_target_span.wrap(row_link_wrapper)
|
|
||||||
|
|
||||||
# embed custom google font(s)
|
|
||||||
fonts_selectors = {
|
|
||||||
"site": "div:not(.notion-code-block)",
|
|
||||||
"navbar": ".notion-topbar div",
|
|
||||||
"title": ".notion-page-block > div, .notion-collection_view_page-block > div[data-root]",
|
|
||||||
"h1": ".notion-header-block div, notion-page-content > notion-collection_view-block > div:first-child div",
|
|
||||||
"h2": ".notion-sub_header-block div",
|
|
||||||
"h3": ".notion-sub_sub_header-block div",
|
|
||||||
"body": ".notion-scroller",
|
|
||||||
"code": ".notion-code-block *",
|
|
||||||
}
|
|
||||||
custom_fonts = self.get_page_config(url).get("fonts", {})
|
|
||||||
if custom_fonts:
|
|
||||||
# append a stylesheet importing the google font for each unique font
|
|
||||||
unique_custom_fonts = set(custom_fonts.values())
|
|
||||||
for font in unique_custom_fonts:
|
|
||||||
if font:
|
|
||||||
google_fonts_embed_name = font.replace(" ", "+")
|
|
||||||
font_href = f"https://fonts.googleapis.com/css2?family={google_fonts_embed_name}:wght@500;600;700&display=swap"
|
|
||||||
custom_font_stylesheet = soup.new_tag(
|
|
||||||
"link", rel="stylesheet", href=font_href
|
|
||||||
)
|
|
||||||
soup.head.append(custom_font_stylesheet)
|
|
||||||
|
|
||||||
# go through each custom font, and add a css rule overriding the font-family
|
|
||||||
# to the font override stylesheet targetting the appropriate selector
|
|
||||||
font_override_stylesheet = soup.new_tag("style", type="text/css")
|
|
||||||
for target, custom_font in custom_fonts.items():
|
|
||||||
if custom_font and not target == "site":
|
|
||||||
log.debug(f"Setting {target} font-family to {custom_font}")
|
|
||||||
font_override_stylesheet.append(
|
|
||||||
fonts_selectors[target]
|
|
||||||
+ " {font-family:"
|
|
||||||
+ custom_font
|
|
||||||
+ " !important} "
|
|
||||||
)
|
|
||||||
site_font = custom_fonts.get("site", None)
|
|
||||||
# process global site font last to more granular settings can override it
|
|
||||||
if site_font:
|
|
||||||
log.debug(f"Setting global site font-family to {site_font}"),
|
|
||||||
font_override_stylesheet.append(
|
|
||||||
fonts_selectors["site"] + " {font-family:" + site_font + "} "
|
|
||||||
)
|
|
||||||
# finally append the font overrides stylesheets to the page
|
|
||||||
soup.head.append(font_override_stylesheet)
|
|
||||||
|
|
||||||
# inject any custom elements to the page
|
|
||||||
custom_injects = self.get_page_config(url).get("inject", {})
|
|
||||||
|
|
||||||
self.inject_custom_tags("head", soup, custom_injects)
|
|
||||||
self.inject_custom_tags("body", soup, custom_injects)
|
|
||||||
|
|
||||||
# inject loconotion's custom stylesheet and script
|
|
||||||
loconotion_custom_css = self.cache_file(Path("bundles/loconotion.css"))
|
|
||||||
custom_css = soup.new_tag(
|
|
||||||
"link", rel="stylesheet", href=str(loconotion_custom_css)
|
|
||||||
)
|
|
||||||
soup.head.insert(-1, custom_css)
|
|
||||||
loconotion_custom_js = self.cache_file(Path("bundles/loconotion.js"))
|
|
||||||
custom_script = soup.new_tag(
|
|
||||||
"script", type="text/javascript", src=str(loconotion_custom_js)
|
|
||||||
)
|
|
||||||
soup.body.insert(-1, custom_script)
|
|
||||||
|
|
||||||
hrefDomain = url.split('notion.site')[0] + 'notion.site'
|
|
||||||
log.info(f"Got the domain as {hrefDomain}")
|
|
||||||
|
|
||||||
# find sub-pages and clean slugs / links
|
|
||||||
sub_pages = []
|
|
||||||
parse_links = not self.get_page_config(url).get("no-links", False)
|
|
||||||
for a in soup.find_all('a', href=True):
|
|
||||||
sub_page_href = a["href"]
|
|
||||||
if sub_page_href.startswith("/"):
|
|
||||||
sub_page_href = hrefDomain + '/'+ a["href"].split('/')[len(a["href"].split('/'))-1]
|
|
||||||
log.info(f"Got this as href {sub_page_href}")
|
|
||||||
if sub_page_href.startswith(hrefDomain):
|
|
||||||
if parse_links or not len(a.find_parents("div", class_="notion-scroller")):
|
|
||||||
# if the link is an anchor link,
|
|
||||||
# check if the page hasn't already been parsed
|
|
||||||
if "#" in sub_page_href:
|
|
||||||
sub_page_href_tokens = sub_page_href.split("#")
|
|
||||||
sub_page_href = sub_page_href_tokens[0]
|
|
||||||
a["href"] = "#" + sub_page_href_tokens[-1]
|
|
||||||
a["class"] = a.get("class", []) + ["loconotion-anchor-link"]
|
|
||||||
if (
|
|
||||||
sub_page_href in self.processed_pages.keys()
|
|
||||||
or sub_page_href in sub_pages
|
|
||||||
):
|
|
||||||
log.debug(
|
|
||||||
f"Original page for anchor link {sub_page_href}"
|
|
||||||
" already parsed / pending parsing, skipping"
|
|
||||||
)
|
|
||||||
continue
|
|
||||||
else:
|
|
||||||
a["href"] = (
|
|
||||||
self.get_page_slug(sub_page_href)
|
|
||||||
if sub_page_href != index
|
|
||||||
else "index.html"
|
|
||||||
)
|
|
||||||
sub_pages.append(sub_page_href)
|
|
||||||
log.debug(f"Found link to page {a['href']}")
|
|
||||||
else:
|
|
||||||
# if the page is set not to follow any links, strip the href
|
|
||||||
# do this only on children of .notion-scroller, we don't want
|
|
||||||
# to strip the links from the top nav bar
|
|
||||||
log.debug(f"Stripping link for {a['href']}")
|
|
||||||
del a["href"]
|
|
||||||
a.name = "span"
|
|
||||||
# remove pointer cursor styling on the link and all children
|
|
||||||
for child in ([a] + a.find_all()):
|
|
||||||
if (child.has_attr("style")):
|
|
||||||
style = cssutils.parseStyle(child['style'])
|
|
||||||
style['cursor'] = "default"
|
|
||||||
child['style'] = style.cssText
|
|
||||||
|
|
||||||
# exports the parsed page
|
|
||||||
html_str = str(soup)
|
|
||||||
html_file = self.get_page_slug(url) if url != index else "index.html"
|
|
||||||
if html_file in self.processed_pages.values():
|
|
||||||
log.error(
|
|
||||||
f"Found duplicate pages with slug '{html_file}' - previous one will be"
|
|
||||||
" overwritten. Make sure that your notion pages names or custom slugs"
|
|
||||||
" in the configuration files are unique"
|
|
||||||
)
|
|
||||||
log.info(f"Exporting page '{url}' as '{html_file}'")
|
|
||||||
with open(self.dist_folder / html_file, "wb") as f:
|
|
||||||
f.write(html_str.encode("utf-8").strip())
|
|
||||||
self.processed_pages[url] = html_file
|
|
||||||
|
|
||||||
# parse sub-pages
|
|
||||||
if sub_pages and not self.args.get("single_page", False):
|
|
||||||
if self.processed_pages:
|
|
||||||
log.debug(f"Pages processed so far: {len(self.processed_pages)}")
|
|
||||||
for sub_page in sub_pages:
|
|
||||||
if not sub_page in self.processed_pages.keys():
|
|
||||||
self.parse_page(sub_page, index=index)
|
|
||||||
|
|
||||||
def open_toggle_blocks(self, timeout: int, exclude=[]):
|
def open_toggle_blocks(self, timeout: int, exclude=[]):
|
||||||
"""Expand all the toggle block in the page to make their content visible
|
"""Expand all the toggle block in the page to make their content visible
|
||||||
|
|
||||||
@ -641,6 +380,214 @@ class Parser:
|
|||||||
# if so, run the function again
|
# if so, run the function again
|
||||||
self.open_toggle_blocks(timeout, opened_toggles)
|
self.open_toggle_blocks(timeout, opened_toggles)
|
||||||
|
|
||||||
|
def clean_up(self, soup):
|
||||||
|
# remove scripts and other tags we don't want / need
|
||||||
|
for unwanted in soup.findAll("script"):
|
||||||
|
unwanted.decompose()
|
||||||
|
for intercom_frame in soup.findAll("iframe", {"id": "intercom-frame"}):
|
||||||
|
intercom_frame.decompose()
|
||||||
|
for intercom_div in soup.findAll("div", {"class": "intercom-lightweight-app"}):
|
||||||
|
intercom_div.decompose()
|
||||||
|
for overlay_div in soup.findAll("div", {"class": "notion-overlay-container"}):
|
||||||
|
overlay_div.decompose()
|
||||||
|
for vendors_css in soup.find_all("link", href=lambda x: x and "vendors~" in x):
|
||||||
|
vendors_css.decompose()
|
||||||
|
|
||||||
|
# collection selectors (List, Gallery, etc.) don't work, so remove them
|
||||||
|
for collection_selector in soup.findAll("div", {"class": "notion-collection-view-select"}):
|
||||||
|
collection_selector.decompose()
|
||||||
|
|
||||||
|
# clean up the default notion meta tags
|
||||||
|
for tag in [
|
||||||
|
"description",
|
||||||
|
"twitter:card",
|
||||||
|
"twitter:site",
|
||||||
|
"twitter:title",
|
||||||
|
"twitter:description",
|
||||||
|
"twitter:image",
|
||||||
|
"twitter:url",
|
||||||
|
"apple-itunes-app",
|
||||||
|
]:
|
||||||
|
unwanted_tag = soup.find("meta", attrs={"name": tag})
|
||||||
|
if unwanted_tag:
|
||||||
|
unwanted_tag.decompose()
|
||||||
|
for tag in [
|
||||||
|
"og:site_name",
|
||||||
|
"og:type",
|
||||||
|
"og:url",
|
||||||
|
"og:title",
|
||||||
|
"og:description",
|
||||||
|
"og:image",
|
||||||
|
]:
|
||||||
|
unwanted_og_tag = soup.find("meta", attrs={"property": tag})
|
||||||
|
if unwanted_og_tag:
|
||||||
|
unwanted_og_tag.decompose()
|
||||||
|
|
||||||
|
def set_custom_meta_tags(self, url, soup):
|
||||||
|
# set custom meta tags
|
||||||
|
custom_meta_tags = self.get_page_config(url).get("meta", [])
|
||||||
|
for custom_meta_tag in custom_meta_tags:
|
||||||
|
tag = soup.new_tag("meta")
|
||||||
|
for attr, value in custom_meta_tag.items():
|
||||||
|
tag.attrs[attr] = value
|
||||||
|
log.debug(f"Adding meta tag {str(tag)}")
|
||||||
|
soup.head.append(tag)
|
||||||
|
|
||||||
|
def process_images_and_emojis(self, soup):
|
||||||
|
# process images & emojis
|
||||||
|
cache_images = True
|
||||||
|
for img in soup.findAll("img"):
|
||||||
|
if img.has_attr("src"):
|
||||||
|
if cache_images and "data:image" not in img["src"]:
|
||||||
|
img_src = img["src"]
|
||||||
|
# if the path starts with /, it's one of notion's predefined images
|
||||||
|
if img["src"].startswith("/"):
|
||||||
|
img_src = f'https://www.notion.so{img["src"]}'
|
||||||
|
# notion's own default images urls are in a weird format, need to sanitize them
|
||||||
|
# img_src = 'https://www.notion.so' + img['src'].split("notion.so")[-1].replace("notion.so", "").split("?")[0]
|
||||||
|
# if (not '.amazonaws' in img_src):
|
||||||
|
# img_src = urllib.parse.unquote(img_src)
|
||||||
|
|
||||||
|
cached_image = self.cache_file(img_src)
|
||||||
|
img["src"] = cached_image
|
||||||
|
elif img["src"].startswith("/"):
|
||||||
|
img["src"] = f'https://www.notion.so{img["src"]}'
|
||||||
|
|
||||||
|
# on emoji images, cache their sprite sheet and re-set their background url
|
||||||
|
if img.has_attr("class") and "notion-emoji" in img["class"]:
|
||||||
|
style = cssutils.parseStyle(img["style"])
|
||||||
|
spritesheet = style["background"]
|
||||||
|
spritesheet_url = spritesheet[
|
||||||
|
spritesheet.find("(") + 1: spritesheet.find(")")
|
||||||
|
]
|
||||||
|
cached_spritesheet_url = self.cache_file(
|
||||||
|
f'https://www.notion.so{spritesheet_url}'
|
||||||
|
)
|
||||||
|
|
||||||
|
style["background"] = spritesheet.replace(
|
||||||
|
spritesheet_url, str(cached_spritesheet_url)
|
||||||
|
)
|
||||||
|
img["style"] = style.cssText
|
||||||
|
|
||||||
|
def process_stylesheets(self, soup):
|
||||||
|
# process stylesheets
|
||||||
|
for link in soup.findAll("link", rel="stylesheet"):
|
||||||
|
if link.has_attr("href") and link["href"].startswith("/"):
|
||||||
|
# we don't need the vendors stylesheet
|
||||||
|
if "vendors~" in link["href"]:
|
||||||
|
continue
|
||||||
|
cached_css_file = self.cache_file(f'https://www.notion.so{link["href"]}')
|
||||||
|
# files in the css file might be reference with a relative path,
|
||||||
|
# so store the path of the current css file
|
||||||
|
parent_css_path = os.path.split(urllib.parse.urlparse(link["href"]).path)[0]
|
||||||
|
# open the locally saved file
|
||||||
|
with open(self.dist_folder / cached_css_file, "rb+") as f:
|
||||||
|
stylesheet = cssutils.parseString(f.read())
|
||||||
|
# open the stylesheet and check for any font-face rule,
|
||||||
|
for rule in stylesheet.cssRules:
|
||||||
|
if rule.type == cssutils.css.CSSRule.FONT_FACE_RULE:
|
||||||
|
# if any are found, download the font file
|
||||||
|
# TODO: maths fonts have fallback font sources
|
||||||
|
font_file = (
|
||||||
|
rule.style["src"].split("url(")[-1].split(")")[0]
|
||||||
|
)
|
||||||
|
# assemble the url given the current css path
|
||||||
|
font_url = "/".join(p.strip("/") for p in ["https://www.notion.so", parent_css_path, font_file] if p.strip("/"))
|
||||||
|
# don't hash the font files filenames, rather get filename only
|
||||||
|
cached_font_file = self.cache_file(font_url, Path(font_file).name)
|
||||||
|
rule.style["src"] = f"url({cached_font_file})"
|
||||||
|
# commit stylesheet edits to file
|
||||||
|
f.seek(0)
|
||||||
|
f.truncate()
|
||||||
|
f.write(stylesheet.cssText)
|
||||||
|
|
||||||
|
link["href"] = str(cached_css_file)
|
||||||
|
|
||||||
|
def add_toggle_custom_logic(self, soup):
|
||||||
|
# add our custom logic to all toggle blocks
|
||||||
|
for toggle_block in soup.findAll("div", {"class": "notion-toggle-block"}):
|
||||||
|
toggle_id = uuid.uuid4()
|
||||||
|
toggle_button = toggle_block.select_one("div[role=button]")
|
||||||
|
toggle_content = toggle_block.find("div", {"class": None, "style": ""})
|
||||||
|
if toggle_button and toggle_content:
|
||||||
|
# add a custom class to the toggle button and content,
|
||||||
|
# plus a custom attribute sharing a unique uiid so
|
||||||
|
# we can hook them up with some custom js logic later
|
||||||
|
toggle_button["class"] = toggle_block.get("class", []) + [
|
||||||
|
"loconotion-toggle-button"
|
||||||
|
]
|
||||||
|
toggle_content["class"] = toggle_content.get("class", []) + [
|
||||||
|
"loconotion-toggle-content"
|
||||||
|
]
|
||||||
|
toggle_content.attrs["loconotion-toggle-id"] = toggle_button.attrs[
|
||||||
|
"loconotion-toggle-id"
|
||||||
|
] = toggle_id
|
||||||
|
|
||||||
|
def process_table_views(self, soup):
|
||||||
|
# if there are any table views in the page, add links to the title rows
|
||||||
|
# the link to the row item is equal to its data-block-id without dashes
|
||||||
|
for table_view in soup.findAll("div", {"class": "notion-table-view"}):
|
||||||
|
for table_row in table_view.findAll(
|
||||||
|
"div", {"class": "notion-collection-item"}
|
||||||
|
):
|
||||||
|
table_row_block_id = table_row["data-block-id"]
|
||||||
|
table_row_href = "/" + table_row_block_id.replace("-", "")
|
||||||
|
row_target_span = table_row.find("span")
|
||||||
|
row_target_span["style"] = row_target_span["style"].replace("pointer-events: none;","")
|
||||||
|
row_link_wrapper = soup.new_tag(
|
||||||
|
"a", attrs={"href": table_row_href, "style": "cursor: pointer; color: inherit; text-decoration: none; fill: inherit;"}
|
||||||
|
)
|
||||||
|
row_target_span.wrap(row_link_wrapper)
|
||||||
|
|
||||||
|
def embed_custom_fonts(self, url, soup):
|
||||||
|
if not (custom_fonts := self.get_page_config(url).get("fonts", {})):
|
||||||
|
return
|
||||||
|
|
||||||
|
# append a stylesheet importing the google font for each unique font
|
||||||
|
unique_custom_fonts = set(custom_fonts.values())
|
||||||
|
for font in unique_custom_fonts:
|
||||||
|
if font:
|
||||||
|
google_fonts_embed_name = font.replace(" ", "+")
|
||||||
|
font_href = f"https://fonts.googleapis.com/css2?family={google_fonts_embed_name}:wght@500;600;700&display=swap"
|
||||||
|
custom_font_stylesheet = soup.new_tag(
|
||||||
|
"link", rel="stylesheet", href=font_href
|
||||||
|
)
|
||||||
|
soup.head.append(custom_font_stylesheet)
|
||||||
|
|
||||||
|
# go through each custom font, and add a css rule overriding the font-family
|
||||||
|
# to the font override stylesheet targetting the appropriate selector
|
||||||
|
font_override_stylesheet = soup.new_tag("style", type="text/css")
|
||||||
|
# embed custom google font(s)
|
||||||
|
fonts_selectors = {
|
||||||
|
"site": "div:not(.notion-code-block)",
|
||||||
|
"navbar": ".notion-topbar div",
|
||||||
|
"title": ".notion-page-block > div, .notion-collection_view_page-block > div[data-root]",
|
||||||
|
"h1": ".notion-header-block div, notion-page-content > notion-collection_view-block > div:first-child div",
|
||||||
|
"h2": ".notion-sub_header-block div",
|
||||||
|
"h3": ".notion-sub_sub_header-block div",
|
||||||
|
"body": ".notion-scroller",
|
||||||
|
"code": ".notion-code-block *",
|
||||||
|
}
|
||||||
|
for target, custom_font in custom_fonts.items():
|
||||||
|
if custom_font and target != "site":
|
||||||
|
log.debug(f"Setting {target} font-family to {custom_font}")
|
||||||
|
font_override_stylesheet.append(
|
||||||
|
fonts_selectors[target]
|
||||||
|
+ " {font-family:"
|
||||||
|
+ custom_font
|
||||||
|
+ " !important} "
|
||||||
|
)
|
||||||
|
|
||||||
|
site_font = custom_fonts.get("site", None)
|
||||||
|
if site_font:
|
||||||
|
log.debug(f"Setting global site font-family to {site_font}"),
|
||||||
|
font_override_stylesheet.append(
|
||||||
|
fonts_selectors["site"] + " {font-family:" + site_font + "} "
|
||||||
|
)
|
||||||
|
|
||||||
|
# finally append the font overrides stylesheets to the page
|
||||||
|
soup.head.append(font_override_stylesheet)
|
||||||
|
|
||||||
def inject_custom_tags(self, section: str, soup, custom_injects: dict):
|
def inject_custom_tags(self, section: str, soup, custom_injects: dict):
|
||||||
"""Inject custom tags to the given section.
|
"""Inject custom tags to the given section.
|
||||||
|
|
||||||
@ -667,6 +614,93 @@ class Parser:
|
|||||||
log.debug(f'Injecting <{section}> tag: {injected_tag}')
|
log.debug(f'Injecting <{section}> tag: {injected_tag}')
|
||||||
soup.find(section).append(injected_tag)
|
soup.find(section).append(injected_tag)
|
||||||
|
|
||||||
|
def inject_loconotion_script_and_css(self, soup):
|
||||||
|
# inject loconotion's custom stylesheet and script
|
||||||
|
loconotion_custom_css = self.cache_file(Path("bundles/loconotion.css"))
|
||||||
|
custom_css = soup.new_tag(
|
||||||
|
"link", rel="stylesheet", href=str(loconotion_custom_css)
|
||||||
|
)
|
||||||
|
soup.head.insert(-1, custom_css)
|
||||||
|
loconotion_custom_js = self.cache_file(Path("bundles/loconotion.js"))
|
||||||
|
custom_script = soup.new_tag(
|
||||||
|
"script", type="text/javascript", src=str(loconotion_custom_js)
|
||||||
|
)
|
||||||
|
soup.body.insert(-1, custom_script)
|
||||||
|
|
||||||
|
def find_subpages(self, url, index, soup, hrefDomain):
|
||||||
|
# find sub-pages and clean slugs / links
|
||||||
|
subpages = []
|
||||||
|
parse_links = not self.get_page_config(url).get("no-links", False)
|
||||||
|
for a in soup.find_all('a', href=True):
|
||||||
|
sub_page_href = a["href"]
|
||||||
|
if sub_page_href.startswith("/"):
|
||||||
|
sub_page_href = f'{hrefDomain}/{a["href"].split("/")[len(a["href"].split("/"))-1]}'
|
||||||
|
log.info(f"Got this as href {sub_page_href}")
|
||||||
|
if sub_page_href.startswith(hrefDomain):
|
||||||
|
if parse_links or not len(a.find_parents("div", class_="notion-scroller")):
|
||||||
|
# if the link is an anchor link,
|
||||||
|
# check if the page hasn't already been parsed
|
||||||
|
if "#" in sub_page_href:
|
||||||
|
sub_page_href_tokens = sub_page_href.split("#")
|
||||||
|
sub_page_href = sub_page_href_tokens[0]
|
||||||
|
a["href"] = f'#{sub_page_href_tokens[-1]}'
|
||||||
|
a["class"] = a.get("class", []) + ["loconotion-anchor-link"]
|
||||||
|
if (
|
||||||
|
sub_page_href in self.processed_pages.keys()
|
||||||
|
or sub_page_href in subpages
|
||||||
|
):
|
||||||
|
log.debug(
|
||||||
|
f"Original page for anchor link {sub_page_href}"
|
||||||
|
" already parsed / pending parsing, skipping"
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
a["href"] = (
|
||||||
|
self.get_page_slug(sub_page_href)
|
||||||
|
if sub_page_href != index
|
||||||
|
else "index.html"
|
||||||
|
)
|
||||||
|
subpages.append(sub_page_href)
|
||||||
|
log.debug(f"Found link to page {a['href']}")
|
||||||
|
else:
|
||||||
|
# if the page is set not to follow any links, strip the href
|
||||||
|
# do this only on children of .notion-scroller, we don't want
|
||||||
|
# to strip the links from the top nav bar
|
||||||
|
log.debug(f"Stripping link for {a['href']}")
|
||||||
|
del a["href"]
|
||||||
|
a.name = "span"
|
||||||
|
# remove pointer cursor styling on the link and all children
|
||||||
|
for child in ([a] + a.find_all()):
|
||||||
|
if (child.has_attr("style")):
|
||||||
|
style = cssutils.parseStyle(child['style'])
|
||||||
|
style['cursor'] = "default"
|
||||||
|
child['style'] = style.cssText
|
||||||
|
return subpages
|
||||||
|
|
||||||
|
def export_parsed_page(self, url, index, soup):
|
||||||
|
# exports the parsed page
|
||||||
|
html_str = str(soup)
|
||||||
|
html_file = self.get_page_slug(url) if url != index else "index.html"
|
||||||
|
if html_file in self.processed_pages.values():
|
||||||
|
log.error(
|
||||||
|
f"Found duplicate pages with slug '{html_file}' - previous one will be"
|
||||||
|
" overwritten. Make sure that your notion pages names or custom slugs"
|
||||||
|
" in the configuration files are unique"
|
||||||
|
)
|
||||||
|
log.info(f"Exporting page '{url}' as '{html_file}'")
|
||||||
|
with open(self.dist_folder / html_file, "wb") as f:
|
||||||
|
f.write(html_str.encode("utf-8").strip())
|
||||||
|
self.processed_pages[url] = html_file
|
||||||
|
|
||||||
|
def parse_subpages(self, index, subpages):
|
||||||
|
# parse sub-pages
|
||||||
|
if subpages and not self.args.get("single_page", False):
|
||||||
|
if self.processed_pages:
|
||||||
|
log.debug(f"Pages processed so far: {len(self.processed_pages)}")
|
||||||
|
for sub_page in subpages:
|
||||||
|
if sub_page not in self.processed_pages.keys():
|
||||||
|
self.parse_page(sub_page, index=index)
|
||||||
|
|
||||||
def load(self, url):
|
def load(self, url):
|
||||||
self.driver.get(url)
|
self.driver.get(url)
|
||||||
WebDriverWait(self.driver, 60).until(notion_page_loaded())
|
WebDriverWait(self.driver, 60).until(notion_page_loaded())
|
||||||
|
Loading…
Reference in New Issue
Block a user