mirror of
https://github.com/leoncvlt/loconotion.git
synced 2024-08-30 18:12:12 +00:00
Make `processed_pages` an attribute of `Parser`
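This removes the dangerous practice of passing function results through a mutable argument: `parse_page` used to accumulate its results in a `processed_pages={}` default argument, which Python creates once and shares across calls. The results now live in `self.processed_pages`, which `run` resets before parsing.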
This commit is contained in:
parent
91dea4a5d8
commit
e4bb04dbf6
@ -243,14 +243,23 @@ class Parser:
|
||||
options=chrome_options,
|
||||
)
|
||||
|
||||
def parse_page(self, url, processed_pages={}, index=None):
|
||||
def parse_page(self, url: str, index: str = None):
|
||||
"""Parse page at url and write it to file, then recursively parse all subpages.
|
||||
|
||||
Args:
|
||||
url (str): URL of the page to parse.
|
||||
index (str, optional): URL of the index page. Defaults to None.
|
||||
|
||||
After the page at `url` has been parsed, calls itself recursively for every subpage
|
||||
it has discovered.
|
||||
"""
|
||||
log.info(f"Parsing page '{url}'")
|
||||
log.debug(f"Using page config: {self.get_page_config(url)}")
|
||||
|
||||
try:
|
||||
self.load(url)
|
||||
if not index:
|
||||
# if this is the first page being parse, set it as the index.html
|
||||
# if this is the first page being parsed, set it as the index.html
|
||||
index = url
|
||||
# if dark theme is enabled, set local storage item and re-load the page
|
||||
if self.args.get("dark_theme", True):
|
||||
@ -595,7 +604,7 @@ class Parser:
|
||||
a["href"] = "#" + sub_page_href_tokens[-1]
|
||||
a["class"] = a.get("class", []) + ["loconotion-anchor-link"]
|
||||
if (
|
||||
sub_page_href in processed_pages.keys()
|
||||
sub_page_href in self.processed_pages.keys()
|
||||
or sub_page_href in sub_pages
|
||||
):
|
||||
log.debug(
|
||||
@ -625,11 +634,10 @@ class Parser:
|
||||
style['cursor'] = "default"
|
||||
child['style'] = style.cssText
|
||||
|
||||
|
||||
# exports the parsed page
|
||||
html_str = str(soup)
|
||||
html_file = self.get_page_slug(url) if url != index else "index.html"
|
||||
if html_file in processed_pages.values():
|
||||
if html_file in self.processed_pages.values():
|
||||
log.error(
|
||||
f"Found duplicate pages with slug '{html_file}' - previous one will be"
|
||||
" overwritten. Make sure that your notion pages names or custom slugs"
|
||||
@ -638,20 +646,15 @@ class Parser:
|
||||
log.info(f"Exporting page '{url}' as '{html_file}'")
|
||||
with open(self.dist_folder / html_file, "wb") as f:
|
||||
f.write(html_str.encode("utf-8").strip())
|
||||
processed_pages[url] = html_file
|
||||
self.processed_pages[url] = html_file
|
||||
|
||||
# parse sub-pages
|
||||
if sub_pages and not self.args.get("single_page", False):
|
||||
if processed_pages:
|
||||
log.debug(f"Pages processed so far: {len(processed_pages)}")
|
||||
if self.processed_pages:
|
||||
log.debug(f"Pages processed so far: {len(self.processed_pages)}")
|
||||
for sub_page in sub_pages:
|
||||
if not sub_page in processed_pages.keys():
|
||||
self.parse_page(
|
||||
sub_page, processed_pages=processed_pages, index=index
|
||||
)
|
||||
|
||||
# we're all done!
|
||||
return processed_pages
|
||||
if not sub_page in self.processed_pages.keys():
|
||||
self.parse_page(sub_page, index=index)
|
||||
|
||||
def load(self, url):
|
||||
self.driver.get(url)
|
||||
@ -659,14 +662,14 @@ class Parser:
|
||||
|
||||
def run(self, url):
|
||||
start_time = time.time()
|
||||
tot_processed_pages = self.parse_page(url)
|
||||
self.processed_pages = {}
|
||||
self.parse_page(url)
|
||||
elapsed_time = time.time() - start_time
|
||||
formatted_time = "{:02d}:{:02d}:{:02d}".format(
|
||||
int(elapsed_time // 3600),
|
||||
int(elapsed_time % 3600 // 60),
|
||||
int(elapsed_time % 60),
|
||||
tot_processed_pages,
|
||||
)
|
||||
log.info(
|
||||
f"Finished!\n\nProcessed {len(tot_processed_pages)} pages in {formatted_time}"
|
||||
f"Finished!\n\nProcessed {len(self.processed_pages)} pages in {formatted_time}"
|
||||
)
|
||||
|
Loading…
Reference in New Issue
Block a user