Make processed_pages an attribute of Parser

This removes the dangerous practice of accumulating
results across recursive calls through a mutable default argument.
This commit is contained in:
Alexey Leshchenko 2022-02-18 11:05:09 +03:00
parent 91dea4a5d8
commit e4bb04dbf6

View File

@@ -243,14 +243,23 @@ class Parser:
options=chrome_options, options=chrome_options,
) )
def parse_page(self, url, processed_pages={}, index=None): def parse_page(self, url: str, index: str = None):
"""Parse page at url and write it to file, then recursively parse all subpages.

After the page at `url` has been parsed, calls itself recursively for every subpage
it has discovered.

Args:
url (str): URL of the page to parse.
index (str, optional): URL of the index page. Defaults to None.
"""
log.info(f"Parsing page '{url}'") log.info(f"Parsing page '{url}'")
log.debug(f"Using page config: {self.get_page_config(url)}") log.debug(f"Using page config: {self.get_page_config(url)}")
try: try:
self.load(url) self.load(url)
if not index: if not index:
# if this is the first page being parse, set it as the index.html # if this is the first page being parsed, set it as the index.html
index = url index = url
# if dark theme is enabled, set local storage item and re-load the page # if dark theme is enabled, set local storage item and re-load the page
if self.args.get("dark_theme", True): if self.args.get("dark_theme", True):
@@ -595,7 +604,7 @@ class Parser:
a["href"] = "#" + sub_page_href_tokens[-1] a["href"] = "#" + sub_page_href_tokens[-1]
a["class"] = a.get("class", []) + ["loconotion-anchor-link"] a["class"] = a.get("class", []) + ["loconotion-anchor-link"]
if ( if (
sub_page_href in processed_pages.keys() sub_page_href in self.processed_pages.keys()
or sub_page_href in sub_pages or sub_page_href in sub_pages
): ):
log.debug( log.debug(
@@ -625,11 +634,10 @@ class Parser:
style['cursor'] = "default" style['cursor'] = "default"
child['style'] = style.cssText child['style'] = style.cssText
# exports the parsed page # exports the parsed page
html_str = str(soup) html_str = str(soup)
html_file = self.get_page_slug(url) if url != index else "index.html" html_file = self.get_page_slug(url) if url != index else "index.html"
if html_file in processed_pages.values(): if html_file in self.processed_pages.values():
log.error( log.error(
f"Found duplicate pages with slug '{html_file}' - previous one will be" f"Found duplicate pages with slug '{html_file}' - previous one will be"
" overwritten. Make sure that your notion pages names or custom slugs" " overwritten. Make sure that your notion pages names or custom slugs"
@@ -638,20 +646,15 @@ class Parser:
log.info(f"Exporting page '{url}' as '{html_file}'") log.info(f"Exporting page '{url}' as '{html_file}'")
with open(self.dist_folder / html_file, "wb") as f: with open(self.dist_folder / html_file, "wb") as f:
f.write(html_str.encode("utf-8").strip()) f.write(html_str.encode("utf-8").strip())
processed_pages[url] = html_file self.processed_pages[url] = html_file
# parse sub-pages # parse sub-pages
if sub_pages and not self.args.get("single_page", False): if sub_pages and not self.args.get("single_page", False):
if processed_pages: if self.processed_pages:
log.debug(f"Pages processed so far: {len(processed_pages)}") log.debug(f"Pages processed so far: {len(self.processed_pages)}")
for sub_page in sub_pages: for sub_page in sub_pages:
if not sub_page in processed_pages.keys(): if not sub_page in self.processed_pages.keys():
self.parse_page( self.parse_page(sub_page, index=index)
sub_page, processed_pages=processed_pages, index=index
)
# we're all done!
return processed_pages
def load(self, url): def load(self, url):
self.driver.get(url) self.driver.get(url)
@@ -659,14 +662,14 @@ class Parser:
def run(self, url): def run(self, url):
start_time = time.time() start_time = time.time()
tot_processed_pages = self.parse_page(url) self.processed_pages = {}
self.parse_page(url)
elapsed_time = time.time() - start_time elapsed_time = time.time() - start_time
formatted_time = "{:02d}:{:02d}:{:02d}".format( formatted_time = "{:02d}:{:02d}:{:02d}".format(
int(elapsed_time // 3600), int(elapsed_time // 3600),
int(elapsed_time % 3600 // 60), int(elapsed_time % 3600 // 60),
int(elapsed_time % 60), int(elapsed_time % 60),
tot_processed_pages,
) )
log.info( log.info(
f"Finished!\n\nProcessed {len(tot_processed_pages)} pages in {formatted_time}" f"Finished!\n\nProcessed {len(self.processed_pages)} pages in {formatted_time}"
) )