diff --git a/loconotion/__main__.py b/loconotion/__main__.py
index eaea815..78ec90b 100644
--- a/loconotion/__main__.py
+++ b/loconotion/__main__.py
@@ -16,8 +16,7 @@
 try:
     import toml
 except ModuleNotFoundError as error:
-    log.critical(
-        f"ModuleNotFoundError: {error}. Have you installed the requirements?")
+    log.critical(f"ModuleNotFoundError: {error}. Have you installed the requirements?")
     sys.exit()
 
 
@@ -43,7 +42,9 @@ def get_args():
         help="Use a specific chromedriver executable instead of the auto-installing one",
     )
     argparser.add_argument(
-        "--single-page", action="store_true", help="Only parse the first page, then stop"
+        "--single-page",
+        action="store_true",
+        help="Only parse the first page, then stop",
     )
     argparser.add_argument(
         "--dark-theme",
@@ -133,7 +134,7 @@ def init_parser(args, log):
     try:
         requests.get(args.target)
     except requests.ConnectionError as exception:
-        log.critical('Connection error')
+        log.critical("Connection error")
 
     if "notion.so" in args.target or "notion.site" in args.target:
         log.info("Initialising parser with simple page url")
@@ -145,12 +146,12 @@ def init_parser(args, log):
     elif Path(args.target).is_file():
         with open(args.target, encoding="utf-8") as f:
             parsed_config = toml.loads(f.read())
-            log.info('Initialising parser with configuration file')
+            log.info("Initialising parser with configuration file")
             log.debug(parsed_config)
             parser = Parser(config=parsed_config, args=vars(args))
 
     else:
-        log.critical(f"Config file {args.target} does not exists")
+        log.critical(f"Config file {args.target} does not exist")
 
 except FileNotFoundError as e:
     log.critical(f"FileNotFoundError: {e}")
diff --git a/loconotion/notionparser.py b/loconotion/notionparser.py
index 1f5c854..6686b1f 100644
--- a/loconotion/notionparser.py
+++ b/loconotion/notionparser.py
@@ -181,8 +181,10 @@ class Parser:
             content_type = response.headers.get("content-type")
             if content_type:
                 file_extension = mimetypes.guess_extension(content_type)
-            elif '%3f' in file_extension.lower():
-                file_extension = re.split("%3f", file_extension, flags=re.IGNORECASE)[0]
+            elif "%3f" in file_extension.lower():
+                file_extension = re.split(
+                    "%3f", file_extension, flags=re.IGNORECASE
+                )[0]
 
             destination = destination.with_suffix(file_extension)
             Path(destination).parent.mkdir(parents=True, exist_ok=True)
@@ -228,8 +230,8 @@ class Parser:
         if not self.args.get("non_headless", False):
             chrome_options.add_argument("--headless")
             chrome_options.add_argument("window-size=1920,1080")
-        chrome_options.add_argument('--no-sandbox')
-        chrome_options.add_argument('--disable-dev-shm-usage')
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-dev-shm-usage")
         chrome_options.add_argument("--log-level=3")
         chrome_options.add_argument("--silent")
         chrome_options.add_argument("--disable-logging")
@@ -302,14 +304,18 @@ class Parser:
         # if dark theme is enabled, set local storage item and re-load the page
         if self.args.get("dark_theme", True):
             log.debug("Dark theme is enabled")
-            self.driver.execute_script("window.localStorage.setItem('theme','{\"mode\":\"dark\"}');")
+            self.driver.execute_script(
+                "window.localStorage.setItem('theme','{\"mode\":\"dark\"}');"
+            )
             self.load(url)
 
         # light theme is on by default
         # enable dark mode based on https://fruitionsite.com/ dark mode hack
-        if self.config.get('theme') == 'dark':
-            self.driver.execute_script("__console.environment.ThemeStore.setState({ mode: 'dark' });")
-\
+        if self.config.get("theme") == "dark":
+            self.driver.execute_script(
"__console.environment.ThemeStore.setState({ mode: 'dark' });" + ) + def scroll_to_the_bottom(self): # scroll at the bottom of the notion-scroller element to load all elements # continue once there are no changes in height after a timeout @@ -392,7 +398,9 @@ class Parser: vendors_css.decompose() # collection selectors (List, Gallery, etc.) don't work, so remove them - for collection_selector in soup.findAll("div", {"class": "notion-collection-view-select"}): + for collection_selector in soup.findAll( + "div", {"class": "notion-collection-view-select"} + ): collection_selector.decompose() # clean up the default notion meta tags @@ -456,10 +464,10 @@ class Parser: style = cssutils.parseStyle(img["style"]) spritesheet = style["background"] spritesheet_url = spritesheet[ - spritesheet.find("(") + 1: spritesheet.find(")") - ] + spritesheet.find("(") + 1 : spritesheet.find(")") + ] cached_spritesheet_url = self.cache_file( - f'https://www.notion.so{spritesheet_url}' + f"https://www.notion.so{spritesheet_url}" ) style["background"] = spritesheet.replace( @@ -474,10 +482,14 @@ class Parser: # we don't need the vendors stylesheet if "vendors~" in link["href"]: continue - cached_css_file = self.cache_file(f'https://www.notion.so{link["href"]}') + cached_css_file = self.cache_file( + f'https://www.notion.so{link["href"]}' + ) # files in the css file might be reference with a relative path, # so store the path of the current css file - parent_css_path = os.path.split(urllib.parse.urlparse(link["href"]).path)[0] + parent_css_path = os.path.split( + urllib.parse.urlparse(link["href"]).path + )[0] # open the locally saved file with open(self.dist_folder / cached_css_file, "rb+") as f: stylesheet = cssutils.parseString(f.read()) @@ -490,9 +502,19 @@ class Parser: rule.style["src"].split("url(")[-1].split(")")[0] ) # assemble the url given the current css path - font_url = "/".join(p.strip("/") for p in ["https://www.notion.so", parent_css_path, font_file] if p.strip("/")) + font_url = "/".join( + p.strip("/") + for p in [ + "https://www.notion.so", + parent_css_path, + font_file, + ] + if p.strip("/") + ) # don't hash the font files filenames, rather get filename only - cached_font_file = self.cache_file(font_url, Path(font_file).name) + cached_font_file = self.cache_file( + font_url, Path(font_file).name + ) rule.style["src"] = f"url({cached_font_file})" # commit stylesheet edits to file f.seek(0) @@ -526,14 +548,20 @@ class Parser: # the link to the row item is equal to its data-block-id without dashes for table_view in soup.findAll("div", {"class": "notion-table-view"}): for table_row in table_view.findAll( - "div", {"class": "notion-collection-item"} + "div", {"class": "notion-collection-item"} ): table_row_block_id = table_row["data-block-id"] table_row_href = "/" + table_row_block_id.replace("-", "") row_target_span = table_row.find("span") - row_target_span["style"] = row_target_span["style"].replace("pointer-events: none;","") + row_target_span["style"] = row_target_span["style"].replace( + "pointer-events: none;", "" + ) row_link_wrapper = soup.new_tag( - "a", attrs={"href": table_row_href, "style": "cursor: pointer; color: inherit; text-decoration: none; fill: inherit;"} + "a", + attrs={ + "href": table_row_href, + "style": "cursor: pointer; color: inherit; text-decoration: none; fill: inherit;", + }, ) row_target_span.wrap(row_link_wrapper) @@ -609,7 +637,7 @@ class Parser: # destination = (self.dist_folder / source.name) # shutil.copyfile(source, destination) injected_tag[attr] = 
-            log.debug(f'Injecting <{section}> tag: {injected_tag}')
+            log.debug(f"Injecting <{section}> tag: {injected_tag}")
             soup.find(section).append(injected_tag)
 
     def inject_loconotion_script_and_css(self, soup):
@@ -629,23 +657,27 @@ class Parser:
         # find sub-pages and clean slugs / links
         subpages = []
         parse_links = not self.get_page_config(url).get("no-links", False)
-        for a in soup.find_all('a', href=True):
+        for a in soup.find_all("a", href=True):
             sub_page_href = a["href"]
             if sub_page_href.startswith("/"):
-                sub_page_href = f'{hrefDomain}/{a["href"].split("/")[len(a["href"].split("/"))-1]}'
+                sub_page_href = (
+                    f'{hrefDomain}/{a["href"].split("/")[len(a["href"].split("/"))-1]}'
+                )
                 log.info(f"Got this as href {sub_page_href}")
             if sub_page_href.startswith(hrefDomain):
-                if parse_links or not len(a.find_parents("div", class_="notion-scroller")):
+                if parse_links or not len(
+                    a.find_parents("div", class_="notion-scroller")
+                ):
                     # if the link is an anchor link,
                     # check if the page hasn't already been parsed
                     if "#" in sub_page_href:
                         sub_page_href_tokens = sub_page_href.split("#")
                         sub_page_href = sub_page_href_tokens[0]
-                        a["href"] = f'#{sub_page_href_tokens[-1]}'
+                        a["href"] = f"#{sub_page_href_tokens[-1]}"
                         a["class"] = a.get("class", []) + ["loconotion-anchor-link"]
                         if (
-                                sub_page_href in self.processed_pages.keys()
-                                or sub_page_href in subpages
+                            sub_page_href in self.processed_pages.keys()
+                            or sub_page_href in subpages
                         ):
                             log.debug(
                                 f"Original page for anchor link {sub_page_href}"
@@ -668,11 +700,11 @@ class Parser:
                 del a["href"]
                 a.name = "span"
                 # remove pointer cursor styling on the link and all children
-                for child in ([a] + a.find_all()):
-                    if (child.has_attr("style")):
-                        style = cssutils.parseStyle(child['style'])
-                        style['cursor'] = "default"
-                        child['style'] = style.cssText
+                for child in [a] + a.find_all():
+                    if child.has_attr("style"):
+                        style = cssutils.parseStyle(child["style"])
+                        style["cursor"] = "default"
+                        child["style"] = style.cssText
         return subpages
 
     def export_parsed_page(self, url, index, soup):