Use new Notion URL format

This commit is contained in:
Sunny Srivastava 2021-08-23 07:53:30 +05:30
parent 4bd84256e0
commit 7f380e406b

View File

@ -573,15 +573,19 @@ class Parser:
"script", type="text/javascript", src=str(loconotion_custom_js) "script", type="text/javascript", src=str(loconotion_custom_js)
) )
soup.body.insert(-1, custom_script) soup.body.insert(-1, custom_script)
log.info(f"Got this as main page URL>> {url}")
hrefDomain = url.split('notion.site')[0] + 'notion.site'
# find sub-pages and clean slugs / links # find sub-pages and clean slugs / links
sub_pages = [] sub_pages = []
parse_links = not self.get_page_config(url).get("no-links", False) parse_links = not self.get_page_config(url).get("no-links", False)
for a in soup.find_all('a', href=True): for a in soup.find_all('a', href=True):
sub_page_href = a["href"] sub_page_href = a["href"]
if sub_page_href.startswith("/"): if sub_page_href.startswith("/"):
sub_page_href = "https://www.notion.so" + a["href"] sub_page_href = hrefDomain + '/'+ a["href"].split('/')[len(a["href"].split('/'))-1]
if sub_page_href.startswith("https://www.notion.so/"): log.info(f"Got this as href {sub_page_href}")
if sub_page_href.startswith(hrefDomain):
if parse_links or not len(a.find_parents("div", class_="notion-scroller")): if parse_links or not len(a.find_parents("div", class_="notion-scroller")):
# if the link is an anchor link, # if the link is an anchor link,
# check if the page hasn't already been parsed # check if the page hasn't already been parsed
@ -632,6 +636,8 @@ class Parser:
" in the configuration files are unique" " in the configuration files are unique"
) )
log.info(f"Exporting page '{url}' as '{html_file}'") log.info(f"Exporting page '{url}' as '{html_file}'")
myfile = Path(self.dist_folder / html_file)
myfile.touch(exist_ok=True)
with open(self.dist_folder / html_file, "wb") as f: with open(self.dist_folder / html_file, "wb") as f:
f.write(html_str.encode("utf-8").strip()) f.write(html_str.encode("utf-8").strip())
processed_pages[url] = html_file processed_pages[url] = html_file