mirror of
https://github.com/leoncvlt/loconotion.git
synced 2024-08-30 18:12:12 +00:00
Added no-links config option to skip links scraping
This commit is contained in:
parent
71fbbb2cec
commit
4c4c5069a4
11
README.md
11
README.md
@ -184,13 +184,20 @@ theme = "dark"
|
||||
[pages.d2fa06f244e64f66880bb0491f58223d.fonts]
|
||||
title = 'DM Mono'
|
||||
|
||||
# set up pretty slugs for the other database pages
|
||||
# set up pretty slugs and options for the other database pages
|
||||
[pages.54dab6011e604430a21dc477cb8e4e3a]
|
||||
slug = "film-gallery"
|
||||
|
||||
[pages.2604ce45890645c79f67d92833083fee]
|
||||
slug = "books-table"
|
||||
[pages.ae0a85c527824a3a855b7f4d31f4e0fc]
|
||||
|
||||
# don't follow any links on this page, so sub-pages linked from it are not parsed
|
||||
# useful for large tables where we don't want individual pages for each item
|
||||
no-links = true
|
||||
|
||||
[pages.a28dba2e7a67448da52f2cd2c641407b]
|
||||
slug = "random-board"
|
||||
no-links = true
|
||||
```
|
||||
|
||||
On top of this, the script can take these optional arguments:
|
||||
|
@ -90,10 +90,17 @@ page = "https://www.notion.so/Loconotion-Example-Page-03c403f4fdc94cc1b315b9469a
|
||||
[pages.d2fa06f244e64f66880bb0491f58223d.fonts]
|
||||
body = 'DM Mono'
|
||||
|
||||
# set up pretty slugs for the other database pages
|
||||
# set up pretty slugs and options for the other database pages
|
||||
[pages.54dab6011e604430a21dc477cb8e4e3a]
|
||||
slug = "film-gallery"
|
||||
|
||||
[pages.2604ce45890645c79f67d92833083fee]
|
||||
slug = "books-table"
|
||||
[pages.ae0a85c527824a3a855b7f4d31f4e0fc]
|
||||
slug = "random-board"
|
||||
|
||||
# don't follow any links on this page, so sub-pages linked from it are not parsed
|
||||
# useful for large tables where we don't want individual pages for each item
|
||||
no-links = true
|
||||
|
||||
[pages.a28dba2e7a67448da52f2cd2c641407b]
|
||||
slug = "random-board"
|
||||
no-links = true
|
@ -576,36 +576,52 @@ class Parser:
|
||||
|
||||
# find sub-pages and clean slugs / links
|
||||
sub_pages = []
|
||||
parse_links = not self.get_page_config(url).get("no-links", False)
|
||||
for a in soup.find_all('a', href=True):
|
||||
sub_page_href = a["href"]
|
||||
if sub_page_href.startswith("/"):
|
||||
sub_page_href = "https://www.notion.so" + a["href"]
|
||||
if sub_page_href.startswith("https://www.notion.so/"):
|
||||
# if the link is an anchor link,
|
||||
# check if the page hasn't already been parsed
|
||||
if "#" in sub_page_href:
|
||||
sub_page_href_tokens = sub_page_href.split("#")
|
||||
sub_page_href = sub_page_href_tokens[0]
|
||||
a["href"] = "#" + sub_page_href_tokens[-1]
|
||||
a["class"] = a.get("class", []) + ["loconotion-anchor-link"]
|
||||
if (
|
||||
sub_page_href in processed_pages.keys()
|
||||
or sub_page_href in sub_pages
|
||||
):
|
||||
log.debug(
|
||||
f"Original page for anchor link {sub_page_href}"
|
||||
" already parsed / pending parsing, skipping"
|
||||
if parse_links or not len(a.find_parents("div", class_="notion-scroller")):
|
||||
# if the link is an anchor link,
|
||||
# check if the page hasn't already been parsed
|
||||
if "#" in sub_page_href:
|
||||
sub_page_href_tokens = sub_page_href.split("#")
|
||||
sub_page_href = sub_page_href_tokens[0]
|
||||
a["href"] = "#" + sub_page_href_tokens[-1]
|
||||
a["class"] = a.get("class", []) + ["loconotion-anchor-link"]
|
||||
if (
|
||||
sub_page_href in processed_pages.keys()
|
||||
or sub_page_href in sub_pages
|
||||
):
|
||||
log.debug(
|
||||
f"Original page for anchor link {sub_page_href}"
|
||||
" already parsed / pending parsing, skipping"
|
||||
)
|
||||
continue
|
||||
else:
|
||||
a["href"] = (
|
||||
self.get_page_slug(sub_page_href)
|
||||
if sub_page_href != index
|
||||
else "index.html"
|
||||
)
|
||||
continue
|
||||
sub_pages.append(sub_page_href)
|
||||
log.debug(f"Found link to page {a['href']}")
|
||||
else:
|
||||
a["href"] = (
|
||||
self.get_page_slug(sub_page_href)
|
||||
if sub_page_href != index
|
||||
else "index.html"
|
||||
)
|
||||
sub_pages.append(sub_page_href)
|
||||
log.debug(f"Found link to page {a['href']}")
|
||||
# if the page is set not to follow any links, strip the href
|
||||
# do this only on children of .notion-scroller, we don't want
|
||||
# to strip the links from the top nav bar
|
||||
log.debug(f"Stripping link for {a['href']}")
|
||||
del a["href"]
|
||||
a.name = "span"
|
||||
# remove pointer cursor styling on the link and all children
|
||||
for child in ([a] + a.find_all()):
|
||||
if (child.has_attr("style")):
|
||||
style = cssutils.parseStyle(child['style'])
|
||||
style['cursor'] = "default"
|
||||
child['style'] = style.cssText
|
||||
|
||||
|
||||
# exports the parsed page
|
||||
html_str = str(soup)
|
||||
html_file = self.get_page_slug(url) if url != index else "index.html"
|
||||
|
Loading…
Reference in New Issue
Block a user