Added no-links config option to skip links scraping

2024-08-30 18:12:12 +00:00 · 2021-02-07 12:56:51 +00:00 · 2021-02-07 12:56:51 +00:00 · 4c4c5069a4
commit 4c4c5069a4
parent 71fbbb2cec
3 changed files with 57 additions and 27 deletions
--- a/README.md
+++ b/README.md
@@ -184,13 +184,20 @@ theme = "dark"
    [pages.d2fa06f244e64f66880bb0491f58223d.fonts]
    title = 'DM Mono' 

-  # set up pretty slugs for the other database pages
+  # set up pretty slugs and options for the other database pages
  [pages.54dab6011e604430a21dc477cb8e4e3a]
    slug = "film-gallery"
+
  [pages.2604ce45890645c79f67d92833083fee]
    slug = "books-table"
-  [pages.ae0a85c527824a3a855b7f4d31f4e0fc]
+
+    # don't follow any link on the page, skipping parsing sub-pages linked from this one
+    # useful for large tables where we don't want individual pages for each item
+    no-links = true
+
+  [pages.a28dba2e7a67448da52f2cd2c641407b]
    slug = "random-board"
+    no-links = true
 ```

 On top of this, the script can take these optional arguments:
--- a/example/example_site.toml
+++ b/example/example_site.toml
@@ -90,10 +90,17 @@ page = "https://www.notion.so/Loconotion-Example-Page-03c403f4fdc94cc1b315b9469a
    [pages.d2fa06f244e64f66880bb0491f58223d.fonts]
    body = 'DM Mono' 

-  # set up pretty slugs for the other database pages
+  # set up pretty slugs and options for the other database pages
  [pages.54dab6011e604430a21dc477cb8e4e3a]
    slug = "film-gallery"
+
  [pages.2604ce45890645c79f67d92833083fee]
    slug = "books-table"
-  [pages.ae0a85c527824a3a855b7f4d31f4e0fc]
-    slug = "random-board"
+
+    # don't follow any link on the page, skipping parsing sub-pages linked from this one
+    # useful for large tables where we don't want individual pages for each item
+    no-links = true
+
+  [pages.a28dba2e7a67448da52f2cd2c641407b]
+    slug = "random-board"
+    no-links = true
--- a/loconotion/notionparser.py
+++ b/loconotion/notionparser.py
@@ -576,36 +576,52 @@ class Parser:

        # find sub-pages and clean slugs / links
        sub_pages = []
+        parse_links = not self.get_page_config(url).get("no-links", False)
        for a in soup.find_all('a', href=True):
            sub_page_href = a["href"]
            if sub_page_href.startswith("/"):
                sub_page_href = "https://www.notion.so" + a["href"]
            if sub_page_href.startswith("https://www.notion.so/"):
-                # if the link is an anchor link,
-                # check if the page hasn't already been parsed
-                if "#" in sub_page_href:
-                    sub_page_href_tokens = sub_page_href.split("#")
-                    sub_page_href = sub_page_href_tokens[0]
-                    a["href"] = "#" + sub_page_href_tokens[-1]
-                    a["class"] = a.get("class", []) + ["loconotion-anchor-link"]
-                    if (
-                            sub_page_href in processed_pages.keys()
-                            or sub_page_href in sub_pages
-                    ):
-                        log.debug(
-                            f"Original page for anchor link {sub_page_href}"
-                            " already parsed / pending parsing, skipping"
+                if parse_links or not len(a.find_parents("div", class_="notion-scroller")):
+                    # if the link is an anchor link,
+                    # check if the page hasn't already been parsed
+                    if "#" in sub_page_href:
+                        sub_page_href_tokens = sub_page_href.split("#")
+                        sub_page_href = sub_page_href_tokens[0]
+                        a["href"] = "#" + sub_page_href_tokens[-1]
+                        a["class"] = a.get("class", []) + ["loconotion-anchor-link"]
+                        if (
+                                sub_page_href in processed_pages.keys()
+                                or sub_page_href in sub_pages
+                        ):
+                            log.debug(
+                                f"Original page for anchor link {sub_page_href}"
+                                " already parsed / pending parsing, skipping"
+                            )
+                            continue
+                    else:
+                        a["href"] = (
+                            self.get_page_slug(sub_page_href)
+                            if sub_page_href != index
+                            else "index.html"
                        )
-                        continue
+                    sub_pages.append(sub_page_href)
+                    log.debug(f"Found link to page {a['href']}")
                else:
-                    a["href"] = (
-                        self.get_page_slug(sub_page_href)
-                        if sub_page_href != index
-                        else "index.html"
-                    )
-                sub_pages.append(sub_page_href)
-                log.debug(f"Found link to page {a['href']}")
+                    # if the page is set not to follow any links, strip the href
+                    # do this only on children of .notion-scroller, we don't want
+                    # to strip the links from the top nav bar
+                    log.debug(f"Stripping link for {a['href']}")
+                    del a["href"]
+                    a.name = "span"
+                    # remove pointer cursor styling on the link and all children
+                    for child in ([a] + a.find_all()):
+                        if (child.has_attr("style")):
+                            style = cssutils.parseStyle(child['style'])
+                            style['cursor'] = "default"
+                            child['style'] = style.cssText

+            
        # exports the parsed page
        html_str = str(soup)
        html_file = self.get_page_slug(url) if url != index else "index.html"