Apply black formatter

Alexey Leshchenko 2022-02-18 12:46:08 +03:00
parent c8a9dcbcd7
commit ffc96882ed
2 changed files with 70 additions and 37 deletions
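
For context, black rewrites Python source into one canonical style: string quotes are normalized to double quotes, and statements are joined or split to fit its default 88-character line length. Below is a minimal sketch of that behavior using black's Python API (black.format_str and black.Mode), applied to two lines touched by this commit; it assumes black is installed (pip install black):

    import black

    # Quote normalization: single quotes become double quotes,
    # as in the log.critical() change below.
    print(black.format_str("log.critical('Connection error')\n", mode=black.Mode()), end="")
    # -> log.critical("Connection error")

    # Line joining: a call wrapped over two lines is collapsed when the
    # result fits within the 88-character limit.
    src = (
        "log.critical(\n"
        '    f"ModuleNotFoundError: {error}. Have you installed the requirements?")\n'
    )
    print(black.format_str(src, mode=black.Mode()), end="")
    # -> log.critical(f"ModuleNotFoundError: {error}. Have you installed the requirements?")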

View File

@@ -16,8 +16,7 @@ try:
     import toml
 except ModuleNotFoundError as error:
-    log.critical(
-        f"ModuleNotFoundError: {error}. Have you installed the requirements?")
+    log.critical(f"ModuleNotFoundError: {error}. Have you installed the requirements?")
     sys.exit()
@@ -43,7 +42,9 @@ def get_args():
         help="Use a specific chromedriver executable instead of the auto-installing one",
     )
     argparser.add_argument(
-        "--single-page", action="store_true", help="Only parse the first page, then stop"
+        "--single-page",
+        action="store_true",
+        help="Only parse the first page, then stop",
     )
     argparser.add_argument(
         "--dark-theme",
@@ -133,7 +134,7 @@ def init_parser(args, log):
     try:
         requests.get(args.target)
     except requests.ConnectionError as exception:
-        log.critical('Connection error')
+        log.critical("Connection error")

     if "notion.so" in args.target or "notion.site" in args.target:
         log.info("Initialising parser with simple page url")
@@ -145,12 +146,12 @@ def init_parser(args, log):
         elif Path(args.target).is_file():
             with open(args.target, encoding="utf-8") as f:
                 parsed_config = toml.loads(f.read())
-                log.info('Initialising parser with configuration file')
+                log.info("Initialising parser with configuration file")
                 log.debug(parsed_config)
                 parser = Parser(config=parsed_config, args=vars(args))
         else:
-            log.critical(f"Config file {args.target} does not exists")
+            log.critical(f"Config file {args.target} does not exist")
     except FileNotFoundError as e:
         log.critical(f"FileNotFoundError: {e}")

View File

@@ -181,8 +181,10 @@ class Parser:
         content_type = response.headers.get("content-type")
         if content_type:
             file_extension = mimetypes.guess_extension(content_type)
-        elif '%3f' in file_extension.lower():
-            file_extension = re.split("%3f", file_extension, flags=re.IGNORECASE)[0]
+        elif "%3f" in file_extension.lower():
+            file_extension = re.split(
+                "%3f", file_extension, flags=re.IGNORECASE
+            )[0]
         destination = destination.with_suffix(file_extension)

         Path(destination).parent.mkdir(parents=True, exist_ok=True)
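
The elif branch above handles extensions scraped from URLs where the query string is still percent-encoded: "%3f" (or "%3F") is an encoded "?". A standalone sketch of the same expression, with a hypothetical input value:

    import re

    file_extension = ".png%3Fwidth=400"  # hypothetical value scraped from a URL
    if "%3f" in file_extension.lower():
        file_extension = re.split("%3f", file_extension, flags=re.IGNORECASE)[0]
    print(file_extension)  # -> .png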
@@ -228,8 +230,8 @@ class Parser:
         if not self.args.get("non_headless", False):
             chrome_options.add_argument("--headless")
         chrome_options.add_argument("window-size=1920,1080")
-        chrome_options.add_argument('--no-sandbox')
-        chrome_options.add_argument('--disable-dev-shm-usage')
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-dev-shm-usage")
         chrome_options.add_argument("--log-level=3")
         chrome_options.add_argument("--silent")
         chrome_options.add_argument("--disable-logging")
@@ -302,14 +304,18 @@ class Parser:
         # if dark theme is enabled, set local storage item and re-load the page
         if self.args.get("dark_theme", True):
             log.debug("Dark theme is enabled")
-            self.driver.execute_script("window.localStorage.setItem('theme','{\"mode\":\"dark\"}');")
+            self.driver.execute_script(
+                "window.localStorage.setItem('theme','{\"mode\":\"dark\"}');"
+            )
             self.load(url)

         # light theme is on by default
         # enable dark mode based on https://fruitionsite.com/ dark mode hack
-        if self.config.get('theme') == 'dark':
-            self.driver.execute_script("__console.environment.ThemeStore.setState({ mode: 'dark' });")
-\
+        if self.config.get("theme") == "dark":
+            self.driver.execute_script(
+                "__console.environment.ThemeStore.setState({ mode: 'dark' });"
+            )

     def scroll_to_the_bottom(self):
         # scroll at the bottom of the notion-scroller element to load all elements
         # continue once there are no changes in height after a timeout
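
The escaping in the reformatted execute_script call is easy to misread: a JSON string sits inside a JavaScript string inside a Python literal. Printing the Python literal shows the exact JavaScript that Selenium sends to the browser:

    script = "window.localStorage.setItem('theme','{\"mode\":\"dark\"}');"
    print(script)
    # -> window.localStorage.setItem('theme','{"mode":"dark"}');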
@@ -392,7 +398,9 @@ class Parser:
             vendors_css.decompose()

         # collection selectors (List, Gallery, etc.) don't work, so remove them
-        for collection_selector in soup.findAll("div", {"class": "notion-collection-view-select"}):
+        for collection_selector in soup.findAll(
+            "div", {"class": "notion-collection-view-select"}
+        ):
             collection_selector.decompose()

         # clean up the default notion meta tags
@@ -459,7 +467,7 @@ class Parser:
                 spritesheet.find("(") + 1 : spritesheet.find(")")
             ]
             cached_spritesheet_url = self.cache_file(
-                f'https://www.notion.so{spritesheet_url}'
+                f"https://www.notion.so{spritesheet_url}"
             )

             style["background"] = spritesheet.replace(
@@ -474,10 +482,14 @@ class Parser:
             # we don't need the vendors stylesheet
             if "vendors~" in link["href"]:
                 continue
-            cached_css_file = self.cache_file(f'https://www.notion.so{link["href"]}')
+            cached_css_file = self.cache_file(
+                f'https://www.notion.so{link["href"]}'
+            )
             # files in the css file might be reference with a relative path,
             # so store the path of the current css file
-            parent_css_path = os.path.split(urllib.parse.urlparse(link["href"]).path)[0]
+            parent_css_path = os.path.split(
+                urllib.parse.urlparse(link["href"]).path
+            )[0]
             # open the locally saved file
             with open(self.dist_folder / cached_css_file, "rb+") as f:
                 stylesheet = cssutils.parseString(f.read())
@@ -490,9 +502,19 @@ class Parser:
                         rule.style["src"].split("url(")[-1].split(")")[0]
                     )
                     # assemble the url given the current css path
-                    font_url = "/".join(p.strip("/") for p in ["https://www.notion.so", parent_css_path, font_file] if p.strip("/"))
+                    font_url = "/".join(
+                        p.strip("/")
+                        for p in [
+                            "https://www.notion.so",
+                            parent_css_path,
+                            font_file,
+                        ]
+                        if p.strip("/")
+                    )
                     # don't hash the font files filenames, rather get filename only
-                    cached_font_file = self.cache_file(font_url, Path(font_file).name)
+                    cached_font_file = self.cache_file(
+                        font_url, Path(font_file).name
+                    )
                     rule.style["src"] = f"url({cached_font_file})"
                 # commit stylesheet edits to file
                 f.seek(0)
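
The reformatted join above builds an absolute font URL from segments that may carry leading or trailing slashes; str.strip("/") only trims the ends of each segment, so the "https://" scheme survives intact and empty segments are filtered out. A standalone check with hypothetical path values:

    parent_css_path = "/assets/"        # hypothetical
    font_file = "fonts/inter.woff2"     # hypothetical
    font_url = "/".join(
        p.strip("/")
        for p in ["https://www.notion.so", parent_css_path, font_file]
        if p.strip("/")
    )
    print(font_url)  # -> https://www.notion.so/assets/fonts/inter.woff2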
@@ -531,9 +553,15 @@ class Parser:
             table_row_block_id = table_row["data-block-id"]
             table_row_href = "/" + table_row_block_id.replace("-", "")
             row_target_span = table_row.find("span")
-            row_target_span["style"] = row_target_span["style"].replace("pointer-events: none;","")
+            row_target_span["style"] = row_target_span["style"].replace(
+                "pointer-events: none;", ""
+            )
             row_link_wrapper = soup.new_tag(
-                "a", attrs={"href": table_row_href, "style": "cursor: pointer; color: inherit; text-decoration: none; fill: inherit;"}
+                "a",
+                attrs={
+                    "href": table_row_href,
+                    "style": "cursor: pointer; color: inherit; text-decoration: none; fill: inherit;",
+                },
             )
             row_target_span.wrap(row_link_wrapper)
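
The exploded soup.new_tag call above relies on BeautifulSoup's attrs parameter plus Tag.wrap to turn a plain table-row span into a clickable link. The same pattern on a hypothetical snippet:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<span>Row title</span>", "html.parser")
    row_target_span = soup.find("span")
    row_link_wrapper = soup.new_tag(
        "a", attrs={"href": "/abc123", "style": "cursor: pointer;"}
    )
    row_target_span.wrap(row_link_wrapper)
    print(soup)  # -> <a href="/abc123" style="cursor: pointer;"><span>Row title</span></a>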
@@ -609,7 +637,7 @@ class Parser:
             # destination = (self.dist_folder / source.name)
             # shutil.copyfile(source, destination)
             injected_tag[attr] = str(cached_custom_file)  # source.name
-            log.debug(f'Injecting <{section}> tag: {injected_tag}')
+            log.debug(f"Injecting <{section}> tag: {injected_tag}")
             soup.find(section).append(injected_tag)

     def inject_loconotion_script_and_css(self, soup):
@@ -629,19 +657,23 @@ class Parser:
         # find sub-pages and clean slugs / links
         subpages = []
         parse_links = not self.get_page_config(url).get("no-links", False)
-        for a in soup.find_all('a', href=True):
+        for a in soup.find_all("a", href=True):
             sub_page_href = a["href"]
             if sub_page_href.startswith("/"):
-                sub_page_href = f'{hrefDomain}/{a["href"].split("/")[len(a["href"].split("/"))-1]}'
+                sub_page_href = (
+                    f'{hrefDomain}/{a["href"].split("/")[len(a["href"].split("/"))-1]}'
+                )
                 log.info(f"Got this as href {sub_page_href}")
             if sub_page_href.startswith(hrefDomain):
-                if parse_links or not len(a.find_parents("div", class_="notion-scroller")):
+                if parse_links or not len(
+                    a.find_parents("div", class_="notion-scroller")
+                ):
                     # if the link is an anchor link,
                     # check if the page hasn't already been parsed
                     if "#" in sub_page_href:
                         sub_page_href_tokens = sub_page_href.split("#")
                         sub_page_href = sub_page_href_tokens[0]
-                        a["href"] = f'#{sub_page_href_tokens[-1]}'
+                        a["href"] = f"#{sub_page_href_tokens[-1]}"
                         a["class"] = a.get("class", []) + ["loconotion-anchor-link"]
                         if (
                             sub_page_href in self.processed_pages.keys()
@ -668,11 +700,11 @@ class Parser:
del a["href"]
a.name = "span"
# remove pointer cursor styling on the link and all children
for child in ([a] + a.find_all()):
if (child.has_attr("style")):
style = cssutils.parseStyle(child['style'])
style['cursor'] = "default"
child['style'] = style.cssText
for child in [a] + a.find_all():
if child.has_attr("style"):
style = cssutils.parseStyle(child["style"])
style["cursor"] = "default"
child["style"] = style.cssText
return subpages
def export_parsed_page(self, url, index, soup):
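
The cleaned-up loop above uses cssutils to rewrite one property of an inline style: parseStyle returns a CSSStyleDeclaration that supports item assignment and serializes back via cssText. A standalone sketch:

    import cssutils

    style = cssutils.parseStyle("cursor: pointer; color: inherit")
    style["cursor"] = "default"
    print(style.cssText)  # updated declaration, now with cursor: default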