Split parse_page into several methods and apply some minor refactorings

Makes `parse_page` more readable
2024-08-30 18:12:12 +00:00 · 2022-02-18 11:57:45 +03:00 · 2022-02-18 11:57:45 +03:00 · 9b8ca4d771
commit 9b8ca4d771
parent 727121201d
1 changed files with 341 additions and 307 deletions
--- a/loconotion/notionparser.py
+++ b/loconotion/notionparser.py
@ -256,28 +256,63 @@ class Parser:
        log.info(f"Parsing page '{url}'")
        log.debug(f"Using page config: {self.get_page_config(url)}")

+        if not index:  # if this is the first page being parsed
+            index = url  # set it as the index.html
+
        try:
-            self.load(url)
-            if not index:
-                # if this is the first page being parsed, set it as the index.html
-                index = url
-                # if dark theme is enabled, set local storage item and re-load the page
-                if self.args.get("dark_theme", True):
-                    log.debug(f"Dark theme is enabled")
-                    self.driver.execute_script("window.localStorage.setItem('theme','{\"mode\":\"dark\"}');")
-                    self.load(url)
-        except TimeoutException as ex:
+            self.load_correct_theme(url)
+        except TimeoutException:
            log.critical(
                "Timeout waiting for page content to load, or no content found."
                " Are you sure the page is set to public?"
            )
            return

+        self.scroll_to_the_bottom()
+
+        # open the toggle blocks in the page
+        self.open_toggle_blocks(self.args["timeout"])
+
+        # creates soup from the page to start parsing
+        soup = BeautifulSoup(self.driver.page_source, "html.parser")
+
+        self.clean_up(soup)
+        self.set_custom_meta_tags(url, soup)
+        self.process_images_and_emojis(soup)
+        self.process_stylesheets(soup)
+        self.add_toggle_custom_logic(soup)
+        self.process_table_views(soup)
+        self.embed_custom_fonts(url, soup)
+
+        # inject any custom elements to the page
+        custom_injects = self.get_page_config(url).get("inject", {})
+        self.inject_custom_tags("head", soup, custom_injects)
+        self.inject_custom_tags("body", soup, custom_injects)
+
+        self.inject_loconotion_script_and_css(soup)
+
+        hrefDomain = f'{url.split("notion.site")[0]}notion.site'
+        log.info(f"Got the domain as {hrefDomain}")
+
+        subpages = self.find_subpages(url, index, soup, hrefDomain)
+        self.export_parsed_page(url, index, soup)
+        self.parse_subpages(index, subpages)
+
+    def load_correct_theme(self, url):
+        self.load(url)
+
+        # if dark theme is enabled, set local storage item and re-load the page
+        if self.args.get("dark_theme", True):
+            log.debug("Dark theme is enabled")
+            self.driver.execute_script("window.localStorage.setItem('theme','{\"mode\":\"dark\"}');")
+            self.load(url)
+
        # light theme is on by default
        # enable dark mode based on https://fruitionsite.com/ dark mode hack
        if self.config.get('theme') == 'dark':
            self.driver.execute_script("__console.environment.ThemeStore.setState({ mode: 'dark' });")
-
+\
+    def scroll_to_the_bottom(self):
        # scroll at the bottom of the notion-scroller element to load all elements
        # continue once there are no changes in height after a timeout
        # don't do this if the page has a calendar database on it or it will load forever
@ -299,302 +334,6 @@ class Parser:
                    break
                last_height = new_height

-        # open the toggle blocks in the page
-        self.open_toggle_blocks(self.args["timeout"])
-
-        # creates soup from the page to start parsing
-        soup = BeautifulSoup(self.driver.page_source, "html.parser")
-
-        # remove scripts and other tags we don't want / need
-        for unwanted in soup.findAll("script"):
-            unwanted.decompose()
-        for intercom_frame in soup.findAll("iframe", {"id": "intercom-frame"}):
-            intercom_frame.decompose()
-        for intercom_div in soup.findAll("div", {"class": "intercom-lightweight-app"}):
-            intercom_div.decompose()
-        for overlay_div in soup.findAll("div", {"class": "notion-overlay-container"}):
-            overlay_div.decompose()
-        for vendors_css in soup.find_all("link", href=lambda x: x and "vendors~" in x):
-            vendors_css.decompose()
-
-        # collection selectors (List, Gallery, etc.) don't work, so remove them
-        for collection_selector in soup.findAll("div", {"class": "notion-collection-view-select"}):
-            collection_selector.decompose()
-
-        # clean up the default notion meta tags
-        for tag in [
-            "description",
-            "twitter:card",
-            "twitter:site",
-            "twitter:title",
-            "twitter:description",
-            "twitter:image",
-            "twitter:url",
-            "apple-itunes-app",
-        ]:
-            unwanted_tag = soup.find("meta", attrs={"name": tag})
-            if unwanted_tag:
-                unwanted_tag.decompose()
-        for tag in [
-            "og:site_name",
-            "og:type",
-            "og:url",
-            "og:title",
-            "og:description",
-            "og:image",
-        ]:
-            unwanted_og_tag = soup.find("meta", attrs={"property": tag})
-            if unwanted_og_tag:
-                unwanted_og_tag.decompose()
-
-        # set custom meta tags
-        custom_meta_tags = self.get_page_config(url).get("meta", [])
-        for custom_meta_tag in custom_meta_tags:
-            tag = soup.new_tag("meta")
-            for attr, value in custom_meta_tag.items():
-                tag.attrs[attr] = value
-            log.debug(f"Adding meta tag {str(tag)}")
-            soup.head.append(tag)
-
-        # process images & emojis
-        cache_images = True
-        for img in soup.findAll("img"):
-            if img.has_attr("src"):
-                if cache_images and not "data:image" in img["src"]:
-                    img_src = img["src"]
-                    # if the path starts with /, it's one of notion's predefined images
-                    if img["src"].startswith("/"):
-                        img_src = "https://www.notion.so" + img["src"]
-                        # notion's own default images urls are in a weird format, need to sanitize them
-                        # img_src = 'https://www.notion.so' + img['src'].split("notion.so")[-1].replace("notion.so", "").split("?")[0]
-                        # if (not '.amazonaws' in img_src):
-                        # img_src = urllib.parse.unquote(img_src)
-
-                    cached_image = self.cache_file(img_src)
-                    img["src"] = cached_image
-                else:
-                    if img["src"].startswith("/"):
-                        img["src"] = "https://www.notion.so" + img["src"]
-
-            # on emoji images, cache their sprite sheet and re-set their background url
-            if img.has_attr("class") and "notion-emoji" in img["class"]:
-                style = cssutils.parseStyle(img["style"])
-                spritesheet = style["background"]
-                spritesheet_url = spritesheet[
-                                  spritesheet.find("(") + 1: spritesheet.find(")")
-                                  ]
-                cached_spritesheet_url = self.cache_file(
-                    "https://www.notion.so" + spritesheet_url
-                )
-                style["background"] = spritesheet.replace(
-                    spritesheet_url, str(cached_spritesheet_url)
-                )
-                img["style"] = style.cssText
-
-        # process stylesheets
-        for link in soup.findAll("link", rel="stylesheet"):
-            if link.has_attr("href") and link["href"].startswith("/"):
-                # we don't need the vendors stylesheet
-                if "vendors~" in link["href"]:
-                    continue
-                cached_css_file = self.cache_file("https://www.notion.so" + link["href"])
-                # files in the css file might be referenced with a relative path,
-                # so store the path of the current css file
-                parent_css_path = os.path.split(urllib.parse.urlparse(link["href"]).path)[0]
-                # open the locally saved file
-                with open(self.dist_folder / cached_css_file, "rb+") as f:
-                    stylesheet = cssutils.parseString(f.read())
-                    # open the stylesheet and check for any font-face rule,
-                    for rule in stylesheet.cssRules:
-                        if rule.type == cssutils.css.CSSRule.FONT_FACE_RULE:
-                            # if any are found, download the font file
-                            # TODO: maths fonts have fallback font sources
-                            font_file = (
-                                rule.style["src"].split("url(")[-1].split(")")[0]
-                            )
-                            # assemble the url given the current css path
-                            font_url = "/".join(p.strip("/") for p in ["https://www.notion.so", parent_css_path, font_file] if p.strip("/"))
-                            # don't hash the font files filenames, rather get filename only
-                            cached_font_file = self.cache_file(font_url, Path(font_file).name)
-                            rule.style["src"] = f"url({cached_font_file})"
-                    # commit stylesheet edits to file
-                    f.seek(0)
-                    f.truncate()
-                    f.write(stylesheet.cssText)
-
-                link["href"] = str(cached_css_file)
-
-        # add our custom logic to all toggle blocks
-        for toggle_block in soup.findAll("div", {"class": "notion-toggle-block"}):
-            toggle_id = uuid.uuid4()
-            toggle_button = toggle_block.select_one("div[role=button]")
-            toggle_content = toggle_block.find("div", {"class": None, "style": ""})
-            if toggle_button and toggle_content:
-                # add a custom class to the toggle button and content,
-                # plus a custom attribute sharing a unique uuid so
-                # we can hook them up with some custom js logic later
-                toggle_button["class"] = toggle_block.get("class", []) + [
-                    "loconotion-toggle-button"
-                ]
-                toggle_content["class"] = toggle_content.get("class", []) + [
-                    "loconotion-toggle-content"
-                ]
-                toggle_content.attrs["loconotion-toggle-id"] = toggle_button.attrs[
-                    "loconotion-toggle-id"
-                ] = toggle_id
-
-        # if there are any table views in the page, add links to the title rows
-        # the link to the row item is equal to its data-block-id without dashes
-        for table_view in soup.findAll("div", {"class": "notion-table-view"}):
-            for table_row in table_view.findAll(
-                    "div", {"class": "notion-collection-item"}
-            ):
-                table_row_block_id = table_row["data-block-id"]
-                table_row_href = "/" + table_row_block_id.replace("-", "")
-                row_target_span = table_row.find("span")
-                row_target_span["style"] = row_target_span["style"].replace("pointer-events: none;","")
-                row_link_wrapper = soup.new_tag(
-                    "a", attrs={"href": table_row_href, "style": "cursor: pointer; color: inherit; text-decoration: none; fill: inherit;"}
-                )
-                row_target_span.wrap(row_link_wrapper)
-
-        # embed custom google font(s)
-        fonts_selectors = {
-            "site": "div:not(.notion-code-block)",
-            "navbar": ".notion-topbar div",
-            "title": ".notion-page-block > div, .notion-collection_view_page-block > div[data-root]",
-            "h1": ".notion-header-block div, notion-page-content > notion-collection_view-block > div:first-child div",
-            "h2": ".notion-sub_header-block div",
-            "h3": ".notion-sub_sub_header-block div",
-            "body": ".notion-scroller",
-            "code": ".notion-code-block *",
-        }
-        custom_fonts = self.get_page_config(url).get("fonts", {})
-        if custom_fonts:
-            # append a stylesheet importing the google font for each unique font
-            unique_custom_fonts = set(custom_fonts.values())
-            for font in unique_custom_fonts:
-                if font:
-                    google_fonts_embed_name = font.replace(" ", "+")
-                    font_href = f"https://fonts.googleapis.com/css2?family={google_fonts_embed_name}:wght@500;600;700&display=swap"
-                    custom_font_stylesheet = soup.new_tag(
-                        "link", rel="stylesheet", href=font_href
-                    )
-                    soup.head.append(custom_font_stylesheet)
-
-            # go through each custom font, and add a css rule overriding the font-family
-            # to the font override stylesheet targeting the appropriate selector
-            font_override_stylesheet = soup.new_tag("style", type="text/css")
-            for target, custom_font in custom_fonts.items():
-                if custom_font and not target == "site":
-                    log.debug(f"Setting {target} font-family to {custom_font}")
-                    font_override_stylesheet.append(
-                        fonts_selectors[target]
-                        + " {font-family:"
-                        + custom_font
-                        + " !important} "
-                    )
-            site_font = custom_fonts.get("site", None)
-            # process global site font last to more granular settings can override it
-            if site_font:
-                log.debug(f"Setting global site font-family to {site_font}"),
-                font_override_stylesheet.append(
-                    fonts_selectors["site"] + " {font-family:" + site_font + "} "
-                )
-            # finally append the font overrides stylesheets to the page
-            soup.head.append(font_override_stylesheet)
-
-        # inject any custom elements to the page
-        custom_injects = self.get_page_config(url).get("inject", {})
-
-        self.inject_custom_tags("head", soup, custom_injects)
-        self.inject_custom_tags("body", soup, custom_injects)
-
-        # inject loconotion's custom stylesheet and script
-        loconotion_custom_css = self.cache_file(Path("bundles/loconotion.css"))
-        custom_css = soup.new_tag(
-            "link", rel="stylesheet", href=str(loconotion_custom_css)
-        )
-        soup.head.insert(-1, custom_css)
-        loconotion_custom_js = self.cache_file(Path("bundles/loconotion.js"))
-        custom_script = soup.new_tag(
-            "script", type="text/javascript", src=str(loconotion_custom_js)
-        )
-        soup.body.insert(-1, custom_script)
-
-        hrefDomain = url.split('notion.site')[0] + 'notion.site'
-        log.info(f"Got the domain as {hrefDomain}")
-
-        # find sub-pages and clean slugs / links
-        sub_pages = []
-        parse_links = not self.get_page_config(url).get("no-links", False)
-        for a in soup.find_all('a', href=True):
-            sub_page_href = a["href"]
-            if sub_page_href.startswith("/"):
-                sub_page_href = hrefDomain + '/'+ a["href"].split('/')[len(a["href"].split('/'))-1]
-                log.info(f"Got this as href {sub_page_href}")
-            if sub_page_href.startswith(hrefDomain):
-                if parse_links or not len(a.find_parents("div", class_="notion-scroller")):
-                    # if the link is an anchor link,
-                    # check if the page hasn't already been parsed
-                    if "#" in sub_page_href:
-                        sub_page_href_tokens = sub_page_href.split("#")
-                        sub_page_href = sub_page_href_tokens[0]
-                        a["href"] = "#" + sub_page_href_tokens[-1]
-                        a["class"] = a.get("class", []) + ["loconotion-anchor-link"]
-                        if (
-                                sub_page_href in self.processed_pages.keys()
-                                or sub_page_href in sub_pages
-                        ):
-                            log.debug(
-                                f"Original page for anchor link {sub_page_href}"
-                                " already parsed / pending parsing, skipping"
-                            )
-                            continue
-                    else:
-                        a["href"] = (
-                            self.get_page_slug(sub_page_href)
-                            if sub_page_href != index
-                            else "index.html"
-                        )
-                    sub_pages.append(sub_page_href)
-                    log.debug(f"Found link to page {a['href']}")
-                else:
-                    # if the page is set not to follow any links, strip the href
-                    # do this only on children of .notion-scroller, we don't want
-                    # to strip the links from the top nav bar
-                    log.debug(f"Stripping link for {a['href']}")
-                    del a["href"]
-                    a.name = "span"
-                    # remove pointer cursor styling on the link and all children
-                    for child in ([a] + a.find_all()):
-                        if (child.has_attr("style")):
-                            style = cssutils.parseStyle(child['style'])
-                            style['cursor'] = "default"
-                            child['style'] = style.cssText
-
-        # exports the parsed page
-        html_str = str(soup)
-        html_file = self.get_page_slug(url) if url != index else "index.html"
-        if html_file in self.processed_pages.values():
-            log.error(
-                f"Found duplicate pages with slug '{html_file}' - previous one will be"
-                " overwritten. Make sure that your notion pages names or custom slugs"
-                " in the configuration files are unique"
-            )
-        log.info(f"Exporting page '{url}' as '{html_file}'")
-        with open(self.dist_folder / html_file, "wb") as f:
-            f.write(html_str.encode("utf-8").strip())
-        self.processed_pages[url] = html_file
-
-        # parse sub-pages
-        if sub_pages and not self.args.get("single_page", False):
-            if self.processed_pages:
-                log.debug(f"Pages processed so far: {len(self.processed_pages)}")
-            for sub_page in sub_pages:
-                if not sub_page in self.processed_pages.keys():
-                    self.parse_page(sub_page, index=index)
-
    def open_toggle_blocks(self, timeout: int, exclude=[]):
        """Expand all the toggle block in the page to make their content visible

@ -641,6 +380,214 @@ class Parser:
            # if so, run the function again
            self.open_toggle_blocks(timeout, opened_toggles)

+    def clean_up(self, soup):
+        # remove scripts and other tags we don't want / need
+        for unwanted in soup.findAll("script"):
+            unwanted.decompose()
+        for intercom_frame in soup.findAll("iframe", {"id": "intercom-frame"}):
+            intercom_frame.decompose()
+        for intercom_div in soup.findAll("div", {"class": "intercom-lightweight-app"}):
+            intercom_div.decompose()
+        for overlay_div in soup.findAll("div", {"class": "notion-overlay-container"}):
+            overlay_div.decompose()
+        for vendors_css in soup.find_all("link", href=lambda x: x and "vendors~" in x):
+            vendors_css.decompose()
+
+        # collection selectors (List, Gallery, etc.) don't work, so remove them
+        for collection_selector in soup.findAll("div", {"class": "notion-collection-view-select"}):
+            collection_selector.decompose()
+
+        # clean up the default notion meta tags
+        for tag in [
+            "description",
+            "twitter:card",
+            "twitter:site",
+            "twitter:title",
+            "twitter:description",
+            "twitter:image",
+            "twitter:url",
+            "apple-itunes-app",
+        ]:
+            unwanted_tag = soup.find("meta", attrs={"name": tag})
+            if unwanted_tag:
+                unwanted_tag.decompose()
+        for tag in [
+            "og:site_name",
+            "og:type",
+            "og:url",
+            "og:title",
+            "og:description",
+            "og:image",
+        ]:
+            unwanted_og_tag = soup.find("meta", attrs={"property": tag})
+            if unwanted_og_tag:
+                unwanted_og_tag.decompose()
+
+    def set_custom_meta_tags(self, url, soup):
+        # set custom meta tags
+        custom_meta_tags = self.get_page_config(url).get("meta", [])
+        for custom_meta_tag in custom_meta_tags:
+            tag = soup.new_tag("meta")
+            for attr, value in custom_meta_tag.items():
+                tag.attrs[attr] = value
+            log.debug(f"Adding meta tag {str(tag)}")
+            soup.head.append(tag)
+
+    def process_images_and_emojis(self, soup):
+        # process images & emojis
+        cache_images = True
+        for img in soup.findAll("img"):
+            if img.has_attr("src"):
+                if cache_images and "data:image" not in img["src"]:
+                    img_src = img["src"]
+                    # if the path starts with /, it's one of notion's predefined images
+                    if img["src"].startswith("/"):
+                        img_src = f'https://www.notion.so{img["src"]}'
+                        # notion's own default images urls are in a weird format, need to sanitize them
+                        # img_src = 'https://www.notion.so' + img['src'].split("notion.so")[-1].replace("notion.so", "").split("?")[0]
+                        # if (not '.amazonaws' in img_src):
+                        # img_src = urllib.parse.unquote(img_src)
+
+                    cached_image = self.cache_file(img_src)
+                    img["src"] = cached_image
+                elif img["src"].startswith("/"):
+                    img["src"] = f'https://www.notion.so{img["src"]}'
+
+            # on emoji images, cache their sprite sheet and re-set their background url
+            if img.has_attr("class") and "notion-emoji" in img["class"]:
+                style = cssutils.parseStyle(img["style"])
+                spritesheet = style["background"]
+                spritesheet_url = spritesheet[
+                                  spritesheet.find("(") + 1: spritesheet.find(")")
+                                  ]
+                cached_spritesheet_url = self.cache_file(
+                    f'https://www.notion.so{spritesheet_url}'
+                )
+
+                style["background"] = spritesheet.replace(
+                    spritesheet_url, str(cached_spritesheet_url)
+                )
+                img["style"] = style.cssText
+
+    def process_stylesheets(self, soup):
+        # process stylesheets
+        for link in soup.findAll("link", rel="stylesheet"):
+            if link.has_attr("href") and link["href"].startswith("/"):
+                # we don't need the vendors stylesheet
+                if "vendors~" in link["href"]:
+                    continue
+                cached_css_file = self.cache_file(f'https://www.notion.so{link["href"]}')
+                # files in the css file might be referenced with a relative path,
+                # so store the path of the current css file
+                parent_css_path = os.path.split(urllib.parse.urlparse(link["href"]).path)[0]
+                # open the locally saved file
+                with open(self.dist_folder / cached_css_file, "rb+") as f:
+                    stylesheet = cssutils.parseString(f.read())
+                    # open the stylesheet and check for any font-face rule,
+                    for rule in stylesheet.cssRules:
+                        if rule.type == cssutils.css.CSSRule.FONT_FACE_RULE:
+                            # if any are found, download the font file
+                            # TODO: maths fonts have fallback font sources
+                            font_file = (
+                                rule.style["src"].split("url(")[-1].split(")")[0]
+                            )
+                            # assemble the url given the current css path
+                            font_url = "/".join(p.strip("/") for p in ["https://www.notion.so", parent_css_path, font_file] if p.strip("/"))
+                            # don't hash the font files filenames, rather get filename only
+                            cached_font_file = self.cache_file(font_url, Path(font_file).name)
+                            rule.style["src"] = f"url({cached_font_file})"
+                    # commit stylesheet edits to file
+                    f.seek(0)
+                    f.truncate()
+                    f.write(stylesheet.cssText)
+
+                link["href"] = str(cached_css_file)
+
+    def add_toggle_custom_logic(self, soup):
+        # add our custom logic to all toggle blocks
+        for toggle_block in soup.findAll("div", {"class": "notion-toggle-block"}):
+            toggle_id = uuid.uuid4()
+            toggle_button = toggle_block.select_one("div[role=button]")
+            toggle_content = toggle_block.find("div", {"class": None, "style": ""})
+            if toggle_button and toggle_content:
+                # add a custom class to the toggle button and content,
+                # plus a custom attribute sharing a unique uuid so
+                # we can hook them up with some custom js logic later
+                toggle_button["class"] = toggle_block.get("class", []) + [
+                    "loconotion-toggle-button"
+                ]
+                toggle_content["class"] = toggle_content.get("class", []) + [
+                    "loconotion-toggle-content"
+                ]
+                toggle_content.attrs["loconotion-toggle-id"] = toggle_button.attrs[
+                    "loconotion-toggle-id"
+                ] = toggle_id
+
+    def process_table_views(self, soup):
+        # if there are any table views in the page, add links to the title rows
+        # the link to the row item is equal to its data-block-id without dashes
+        for table_view in soup.findAll("div", {"class": "notion-table-view"}):
+            for table_row in table_view.findAll(
+                    "div", {"class": "notion-collection-item"}
+            ):
+                table_row_block_id = table_row["data-block-id"]
+                table_row_href = "/" + table_row_block_id.replace("-", "")
+                row_target_span = table_row.find("span")
+                row_target_span["style"] = row_target_span["style"].replace("pointer-events: none;","")
+                row_link_wrapper = soup.new_tag(
+                    "a", attrs={"href": table_row_href, "style": "cursor: pointer; color: inherit; text-decoration: none; fill: inherit;"}
+                )
+                row_target_span.wrap(row_link_wrapper)
+
+    def embed_custom_fonts(self, url, soup):
+        if not (custom_fonts := self.get_page_config(url).get("fonts", {})):
+            return
+
+        # append a stylesheet importing the google font for each unique font
+        unique_custom_fonts = set(custom_fonts.values())
+        for font in unique_custom_fonts:
+            if font:
+                google_fonts_embed_name = font.replace(" ", "+")
+                font_href = f"https://fonts.googleapis.com/css2?family={google_fonts_embed_name}:wght@500;600;700&display=swap"
+                custom_font_stylesheet = soup.new_tag(
+                    "link", rel="stylesheet", href=font_href
+                )
+                soup.head.append(custom_font_stylesheet)
+
+        # go through each custom font, and add a css rule overriding the font-family
+        # to the font override stylesheet targeting the appropriate selector
+        font_override_stylesheet = soup.new_tag("style", type="text/css")
+        # embed custom google font(s)
+        fonts_selectors = {
+            "site": "div:not(.notion-code-block)",
+            "navbar": ".notion-topbar div",
+            "title": ".notion-page-block > div, .notion-collection_view_page-block > div[data-root]",
+            "h1": ".notion-header-block div, notion-page-content > notion-collection_view-block > div:first-child div",
+            "h2": ".notion-sub_header-block div",
+            "h3": ".notion-sub_sub_header-block div",
+            "body": ".notion-scroller",
+            "code": ".notion-code-block *",
+        }
+        for target, custom_font in custom_fonts.items():
+            if custom_font and target != "site":
+                log.debug(f"Setting {target} font-family to {custom_font}")
+                font_override_stylesheet.append(
+                    fonts_selectors[target]
+                    + " {font-family:"
+                    + custom_font
+                    + " !important} "
+                )
+
+        site_font = custom_fonts.get("site", None)
+        if site_font:
+            log.debug(f"Setting global site font-family to {site_font}"),
+            font_override_stylesheet.append(
+                fonts_selectors["site"] + " {font-family:" + site_font + "} "
+            )
+
+        # finally append the font overrides stylesheets to the page
+        soup.head.append(font_override_stylesheet)
+
    def inject_custom_tags(self, section: str, soup, custom_injects: dict):
        """Inject custom tags to the given section.

@ -667,6 +614,93 @@ class Parser:
                log.debug(f'Injecting <{section}> tag: {injected_tag}')
                soup.find(section).append(injected_tag)

+    def inject_loconotion_script_and_css(self, soup):
+        # inject loconotion's custom stylesheet and script
+        loconotion_custom_css = self.cache_file(Path("bundles/loconotion.css"))
+        custom_css = soup.new_tag(
+            "link", rel="stylesheet", href=str(loconotion_custom_css)
+        )
+        soup.head.insert(-1, custom_css)
+        loconotion_custom_js = self.cache_file(Path("bundles/loconotion.js"))
+        custom_script = soup.new_tag(
+            "script", type="text/javascript", src=str(loconotion_custom_js)
+        )
+        soup.body.insert(-1, custom_script)
+
+    def find_subpages(self, url, index, soup, hrefDomain):
+        # find sub-pages and clean slugs / links
+        subpages = []
+        parse_links = not self.get_page_config(url).get("no-links", False)
+        for a in soup.find_all('a', href=True):
+            sub_page_href = a["href"]
+            if sub_page_href.startswith("/"):
+                sub_page_href = f'{hrefDomain}/{a["href"].split("/")[len(a["href"].split("/"))-1]}'
+                log.info(f"Got this as href {sub_page_href}")
+            if sub_page_href.startswith(hrefDomain):
+                if parse_links or not len(a.find_parents("div", class_="notion-scroller")):
+                    # if the link is an anchor link,
+                    # check if the page hasn't already been parsed
+                    if "#" in sub_page_href:
+                        sub_page_href_tokens = sub_page_href.split("#")
+                        sub_page_href = sub_page_href_tokens[0]
+                        a["href"] = f'#{sub_page_href_tokens[-1]}'
+                        a["class"] = a.get("class", []) + ["loconotion-anchor-link"]
+                        if (
+                                sub_page_href in self.processed_pages.keys()
+                                or sub_page_href in subpages
+                        ):
+                            log.debug(
+                                f"Original page for anchor link {sub_page_href}"
+                                " already parsed / pending parsing, skipping"
+                            )
+                            continue
+                    else:
+                        a["href"] = (
+                            self.get_page_slug(sub_page_href)
+                            if sub_page_href != index
+                            else "index.html"
+                        )
+                    subpages.append(sub_page_href)
+                    log.debug(f"Found link to page {a['href']}")
+                else:
+                    # if the page is set not to follow any links, strip the href
+                    # do this only on children of .notion-scroller, we don't want
+                    # to strip the links from the top nav bar
+                    log.debug(f"Stripping link for {a['href']}")
+                    del a["href"]
+                    a.name = "span"
+                    # remove pointer cursor styling on the link and all children
+                    for child in ([a] + a.find_all()):
+                        if (child.has_attr("style")):
+                            style = cssutils.parseStyle(child['style'])
+                            style['cursor'] = "default"
+                            child['style'] = style.cssText
+        return subpages
+
+    def export_parsed_page(self, url, index, soup):
+        # exports the parsed page
+        html_str = str(soup)
+        html_file = self.get_page_slug(url) if url != index else "index.html"
+        if html_file in self.processed_pages.values():
+            log.error(
+                f"Found duplicate pages with slug '{html_file}' - previous one will be"
+                " overwritten. Make sure that your notion pages names or custom slugs"
+                " in the configuration files are unique"
+            )
+        log.info(f"Exporting page '{url}' as '{html_file}'")
+        with open(self.dist_folder / html_file, "wb") as f:
+            f.write(html_str.encode("utf-8").strip())
+        self.processed_pages[url] = html_file
+
+    def parse_subpages(self, index, subpages):
+        # parse sub-pages
+        if subpages and not self.args.get("single_page", False):
+            if self.processed_pages:
+                log.debug(f"Pages processed so far: {len(self.processed_pages)}")
+            for sub_page in subpages:
+                if sub_page not in self.processed_pages.keys():
+                    self.parse_page(sub_page, index=index)
+
    def load(self, url):
        self.driver.get(url)
        WebDriverWait(self.driver, 60).until(notion_page_loaded())