diff --git a/loconotion/notionparser.py b/loconotion/notionparser.py index 3d6dba8..7f1101e 100644 --- a/loconotion/notionparser.py +++ b/loconotion/notionparser.py @@ -256,28 +256,63 @@ class Parser: log.info(f"Parsing page '{url}'") log.debug(f"Using page config: {self.get_page_config(url)}") + if not index: # if this is the first page being parsed + index = url # set it as the index.html + try: - self.load(url) - if not index: - # if this is the first page being parsed, set it as the index.html - index = url - # if dark theme is enabled, set local storage item and re-load the page - if self.args.get("dark_theme", True): - log.debug(f"Dark theme is enabled") - self.driver.execute_script("window.localStorage.setItem('theme','{\"mode\":\"dark\"}');") - self.load(url) - except TimeoutException as ex: + self.load_correct_theme(url) + except TimeoutException: log.critical( "Timeout waiting for page content to load, or no content found." " Are you sure the page is set to public?" ) return + self.scroll_to_the_bottom() + + # open the toggle blocks in the page + self.open_toggle_blocks(self.args["timeout"]) + + # creates soup from the page to start parsing + soup = BeautifulSoup(self.driver.page_source, "html.parser") + + self.clean_up(soup) + self.set_custom_meta_tags(url, soup) + self.process_images_and_emojis(soup) + self.process_stylesheets(soup) + self.add_toggle_custom_logic(soup) + self.process_table_views(soup) + self.embed_custom_fonts(url, soup) + + # inject any custom elements to the page + custom_injects = self.get_page_config(url).get("inject", {}) + self.inject_custom_tags("head", soup, custom_injects) + self.inject_custom_tags("body", soup, custom_injects) + + self.inject_loconotion_script_and_css(soup) + + hrefDomain = f'{url.split("notion.site")[0]}notion.site' + log.info(f"Got the domain as {hrefDomain}") + + subpages = self.find_subpages(url, index, soup, hrefDomain) + self.export_parsed_page(url, index, soup) + self.parse_subpages(index, subpages) + + def load_correct_theme(self, url): + self.load(url) + + # if dark theme is enabled, set local storage item and re-load the page + if self.args.get("dark_theme", True): + log.debug("Dark theme is enabled") + self.driver.execute_script("window.localStorage.setItem('theme','{\"mode\":\"dark\"}');") + self.load(url) + # light theme is on by default # enable dark mode based on https://fruitionsite.com/ dark mode hack if self.config.get('theme') == 'dark': self.driver.execute_script("__console.environment.ThemeStore.setState({ mode: 'dark' });") - +\ + def scroll_to_the_bottom(self): # scroll at the bottom of the notion-scroller element to load all elements # continue once there are no changes in height after a timeout # don't do this if the page has a calendar databse on it or it will load forever @@ -299,302 +334,6 @@ class Parser: break last_height = new_height - # open the toggle blocks in the page - self.open_toggle_blocks(self.args["timeout"]) - - # creates soup from the page to start parsing - soup = BeautifulSoup(self.driver.page_source, "html.parser") - - # remove scripts and other tags we don't want / need - for unwanted in soup.findAll("script"): - unwanted.decompose() - for intercom_frame in soup.findAll("iframe", {"id": "intercom-frame"}): - intercom_frame.decompose() - for intercom_div in soup.findAll("div", {"class": "intercom-lightweight-app"}): - intercom_div.decompose() - for overlay_div in soup.findAll("div", {"class": "notion-overlay-container"}): - overlay_div.decompose() - for vendors_css in soup.find_all("link", href=lambda x: x and "vendors~" in x): - vendors_css.decompose() - - # collection selectors (List, Gallery, etc.) don't work, so remove them - for collection_selector in soup.findAll("div", {"class": "notion-collection-view-select"}): - collection_selector.decompose() - - # clean up the default notion meta tags - for tag in [ - "description", - "twitter:card", - "twitter:site", - "twitter:title", - "twitter:description", - "twitter:image", - "twitter:url", - "apple-itunes-app", - ]: - unwanted_tag = soup.find("meta", attrs={"name": tag}) - if unwanted_tag: - unwanted_tag.decompose() - for tag in [ - "og:site_name", - "og:type", - "og:url", - "og:title", - "og:description", - "og:image", - ]: - unwanted_og_tag = soup.find("meta", attrs={"property": tag}) - if unwanted_og_tag: - unwanted_og_tag.decompose() - - # set custom meta tags - custom_meta_tags = self.get_page_config(url).get("meta", []) - for custom_meta_tag in custom_meta_tags: - tag = soup.new_tag("meta") - for attr, value in custom_meta_tag.items(): - tag.attrs[attr] = value - log.debug(f"Adding meta tag {str(tag)}") - soup.head.append(tag) - - # process images & emojis - cache_images = True - for img in soup.findAll("img"): - if img.has_attr("src"): - if cache_images and not "data:image" in img["src"]: - img_src = img["src"] - # if the path starts with /, it's one of notion's predefined images - if img["src"].startswith("/"): - img_src = "https://www.notion.so" + img["src"] - # notion's own default images urls are in a weird format, need to sanitize them - # img_src = 'https://www.notion.so' + img['src'].split("notion.so")[-1].replace("notion.so", "").split("?")[0] - # if (not '.amazonaws' in img_src): - # img_src = urllib.parse.unquote(img_src) - - cached_image = self.cache_file(img_src) - img["src"] = cached_image - else: - if img["src"].startswith("/"): - img["src"] = "https://www.notion.so" + img["src"] - - # on emoji images, cache their sprite sheet and re-set their background url - if img.has_attr("class") and "notion-emoji" in img["class"]: - style = cssutils.parseStyle(img["style"]) - spritesheet = style["background"] - spritesheet_url = spritesheet[ - spritesheet.find("(") + 1: spritesheet.find(")") - ] - cached_spritesheet_url = self.cache_file( - "https://www.notion.so" + spritesheet_url - ) - style["background"] = spritesheet.replace( - spritesheet_url, str(cached_spritesheet_url) - ) - img["style"] = style.cssText - - # process stylesheets - for link in soup.findAll("link", rel="stylesheet"): - if link.has_attr("href") and link["href"].startswith("/"): - # we don't need the vendors stylesheet - if "vendors~" in link["href"]: - continue - cached_css_file = self.cache_file("https://www.notion.so" + link["href"]) - # files in the css file might be reference with a relative path, - # so store the path of the current css file - parent_css_path = os.path.split(urllib.parse.urlparse(link["href"]).path)[0] - # open the locally saved file - with open(self.dist_folder / cached_css_file, "rb+") as f: - stylesheet = cssutils.parseString(f.read()) - # open the stylesheet and check for any font-face rule, - for rule in stylesheet.cssRules: - if rule.type == cssutils.css.CSSRule.FONT_FACE_RULE: - # if any are found, download the font file - # TODO: maths fonts have fallback font sources - font_file = ( - rule.style["src"].split("url(")[-1].split(")")[0] - ) - # assemble the url given the current css path - font_url = "/".join(p.strip("/") for p in ["https://www.notion.so", parent_css_path, font_file] if p.strip("/")) - # don't hash the font files filenames, rather get filename only - cached_font_file = self.cache_file(font_url, Path(font_file).name) - rule.style["src"] = f"url({cached_font_file})" - # commit stylesheet edits to file - f.seek(0) - f.truncate() - f.write(stylesheet.cssText) - - link["href"] = str(cached_css_file) - - # add our custom logic to all toggle blocks - for toggle_block in soup.findAll("div", {"class": "notion-toggle-block"}): - toggle_id = uuid.uuid4() - toggle_button = toggle_block.select_one("div[role=button]") - toggle_content = toggle_block.find("div", {"class": None, "style": ""}) - if toggle_button and toggle_content: - # add a custom class to the toggle button and content, - # plus a custom attribute sharing a unique uiid so - # we can hook them up with some custom js logic later - toggle_button["class"] = toggle_block.get("class", []) + [ - "loconotion-toggle-button" - ] - toggle_content["class"] = toggle_content.get("class", []) + [ - "loconotion-toggle-content" - ] - toggle_content.attrs["loconotion-toggle-id"] = toggle_button.attrs[ - "loconotion-toggle-id" - ] = toggle_id - - # if there are any table views in the page, add links to the title rows - # the link to the row item is equal to its data-block-id without dashes - for table_view in soup.findAll("div", {"class": "notion-table-view"}): - for table_row in table_view.findAll( - "div", {"class": "notion-collection-item"} - ): - table_row_block_id = table_row["data-block-id"] - table_row_href = "/" + table_row_block_id.replace("-", "") - row_target_span = table_row.find("span") - row_target_span["style"] = row_target_span["style"].replace("pointer-events: none;","") - row_link_wrapper = soup.new_tag( - "a", attrs={"href": table_row_href, "style": "cursor: pointer; color: inherit; text-decoration: none; fill: inherit;"} - ) - row_target_span.wrap(row_link_wrapper) - - # embed custom google font(s) - fonts_selectors = { - "site": "div:not(.notion-code-block)", - "navbar": ".notion-topbar div", - "title": ".notion-page-block > div, .notion-collection_view_page-block > div[data-root]", - "h1": ".notion-header-block div, notion-page-content > notion-collection_view-block > div:first-child div", - "h2": ".notion-sub_header-block div", - "h3": ".notion-sub_sub_header-block div", - "body": ".notion-scroller", - "code": ".notion-code-block *", - } - custom_fonts = self.get_page_config(url).get("fonts", {}) - if custom_fonts: - # append a stylesheet importing the google font for each unique font - unique_custom_fonts = set(custom_fonts.values()) - for font in unique_custom_fonts: - if font: - google_fonts_embed_name = font.replace(" ", "+") - font_href = f"https://fonts.googleapis.com/css2?family={google_fonts_embed_name}:wght@500;600;700&display=swap" - custom_font_stylesheet = soup.new_tag( - "link", rel="stylesheet", href=font_href - ) - soup.head.append(custom_font_stylesheet) - - # go through each custom font, and add a css rule overriding the font-family - # to the font override stylesheet targetting the appropriate selector - font_override_stylesheet = soup.new_tag("style", type="text/css") - for target, custom_font in custom_fonts.items(): - if custom_font and not target == "site": - log.debug(f"Setting {target} font-family to {custom_font}") - font_override_stylesheet.append( - fonts_selectors[target] - + " {font-family:" - + custom_font - + " !important} " - ) - site_font = custom_fonts.get("site", None) - # process global site font last to more granular settings can override it - if site_font: - log.debug(f"Setting global site font-family to {site_font}"), - font_override_stylesheet.append( - fonts_selectors["site"] + " {font-family:" + site_font + "} " - ) - # finally append the font overrides stylesheets to the page - soup.head.append(font_override_stylesheet) - - # inject any custom elements to the page - custom_injects = self.get_page_config(url).get("inject", {}) - - self.inject_custom_tags("head", soup, custom_injects) - self.inject_custom_tags("body", soup, custom_injects) - - # inject loconotion's custom stylesheet and script - loconotion_custom_css = self.cache_file(Path("bundles/loconotion.css")) - custom_css = soup.new_tag( - "link", rel="stylesheet", href=str(loconotion_custom_css) - ) - soup.head.insert(-1, custom_css) - loconotion_custom_js = self.cache_file(Path("bundles/loconotion.js")) - custom_script = soup.new_tag( - "script", type="text/javascript", src=str(loconotion_custom_js) - ) - soup.body.insert(-1, custom_script) - - hrefDomain = url.split('notion.site')[0] + 'notion.site' - log.info(f"Got the domain as {hrefDomain}") - - # find sub-pages and clean slugs / links - sub_pages = [] - parse_links = not self.get_page_config(url).get("no-links", False) - for a in soup.find_all('a', href=True): - sub_page_href = a["href"] - if sub_page_href.startswith("/"): - sub_page_href = hrefDomain + '/'+ a["href"].split('/')[len(a["href"].split('/'))-1] - log.info(f"Got this as href {sub_page_href}") - if sub_page_href.startswith(hrefDomain): - if parse_links or not len(a.find_parents("div", class_="notion-scroller")): - # if the link is an anchor link, - # check if the page hasn't already been parsed - if "#" in sub_page_href: - sub_page_href_tokens = sub_page_href.split("#") - sub_page_href = sub_page_href_tokens[0] - a["href"] = "#" + sub_page_href_tokens[-1] - a["class"] = a.get("class", []) + ["loconotion-anchor-link"] - if ( - sub_page_href in self.processed_pages.keys() - or sub_page_href in sub_pages - ): - log.debug( - f"Original page for anchor link {sub_page_href}" - " already parsed / pending parsing, skipping" - ) - continue - else: - a["href"] = ( - self.get_page_slug(sub_page_href) - if sub_page_href != index - else "index.html" - ) - sub_pages.append(sub_page_href) - log.debug(f"Found link to page {a['href']}") - else: - # if the page is set not to follow any links, strip the href - # do this only on children of .notion-scroller, we don't want - # to strip the links from the top nav bar - log.debug(f"Stripping link for {a['href']}") - del a["href"] - a.name = "span" - # remove pointer cursor styling on the link and all children - for child in ([a] + a.find_all()): - if (child.has_attr("style")): - style = cssutils.parseStyle(child['style']) - style['cursor'] = "default" - child['style'] = style.cssText - - # exports the parsed page - html_str = str(soup) - html_file = self.get_page_slug(url) if url != index else "index.html" - if html_file in self.processed_pages.values(): - log.error( - f"Found duplicate pages with slug '{html_file}' - previous one will be" - " overwritten. Make sure that your notion pages names or custom slugs" - " in the configuration files are unique" - ) - log.info(f"Exporting page '{url}' as '{html_file}'") - with open(self.dist_folder / html_file, "wb") as f: - f.write(html_str.encode("utf-8").strip()) - self.processed_pages[url] = html_file - - # parse sub-pages - if sub_pages and not self.args.get("single_page", False): - if self.processed_pages: - log.debug(f"Pages processed so far: {len(self.processed_pages)}") - for sub_page in sub_pages: - if not sub_page in self.processed_pages.keys(): - self.parse_page(sub_page, index=index) - def open_toggle_blocks(self, timeout: int, exclude=[]): """Expand all the toggle block in the page to make their content visible @@ -641,6 +380,214 @@ class Parser: # if so, run the function again self.open_toggle_blocks(timeout, opened_toggles) + def clean_up(self, soup): + # remove scripts and other tags we don't want / need + for unwanted in soup.findAll("script"): + unwanted.decompose() + for intercom_frame in soup.findAll("iframe", {"id": "intercom-frame"}): + intercom_frame.decompose() + for intercom_div in soup.findAll("div", {"class": "intercom-lightweight-app"}): + intercom_div.decompose() + for overlay_div in soup.findAll("div", {"class": "notion-overlay-container"}): + overlay_div.decompose() + for vendors_css in soup.find_all("link", href=lambda x: x and "vendors~" in x): + vendors_css.decompose() + + # collection selectors (List, Gallery, etc.) don't work, so remove them + for collection_selector in soup.findAll("div", {"class": "notion-collection-view-select"}): + collection_selector.decompose() + + # clean up the default notion meta tags + for tag in [ + "description", + "twitter:card", + "twitter:site", + "twitter:title", + "twitter:description", + "twitter:image", + "twitter:url", + "apple-itunes-app", + ]: + unwanted_tag = soup.find("meta", attrs={"name": tag}) + if unwanted_tag: + unwanted_tag.decompose() + for tag in [ + "og:site_name", + "og:type", + "og:url", + "og:title", + "og:description", + "og:image", + ]: + unwanted_og_tag = soup.find("meta", attrs={"property": tag}) + if unwanted_og_tag: + unwanted_og_tag.decompose() + + def set_custom_meta_tags(self, url, soup): + # set custom meta tags + custom_meta_tags = self.get_page_config(url).get("meta", []) + for custom_meta_tag in custom_meta_tags: + tag = soup.new_tag("meta") + for attr, value in custom_meta_tag.items(): + tag.attrs[attr] = value + log.debug(f"Adding meta tag {str(tag)}") + soup.head.append(tag) + + def process_images_and_emojis(self, soup): + # process images & emojis + cache_images = True + for img in soup.findAll("img"): + if img.has_attr("src"): + if cache_images and "data:image" not in img["src"]: + img_src = img["src"] + # if the path starts with /, it's one of notion's predefined images + if img["src"].startswith("/"): + img_src = f'https://www.notion.so{img["src"]}' + # notion's own default images urls are in a weird format, need to sanitize them + # img_src = 'https://www.notion.so' + img['src'].split("notion.so")[-1].replace("notion.so", "").split("?")[0] + # if (not '.amazonaws' in img_src): + # img_src = urllib.parse.unquote(img_src) + + cached_image = self.cache_file(img_src) + img["src"] = cached_image + elif img["src"].startswith("/"): + img["src"] = f'https://www.notion.so{img["src"]}' + + # on emoji images, cache their sprite sheet and re-set their background url + if img.has_attr("class") and "notion-emoji" in img["class"]: + style = cssutils.parseStyle(img["style"]) + spritesheet = style["background"] + spritesheet_url = spritesheet[ + spritesheet.find("(") + 1: spritesheet.find(")") + ] + cached_spritesheet_url = self.cache_file( + f'https://www.notion.so{spritesheet_url}' + ) + + style["background"] = spritesheet.replace( + spritesheet_url, str(cached_spritesheet_url) + ) + img["style"] = style.cssText + + def process_stylesheets(self, soup): + # process stylesheets + for link in soup.findAll("link", rel="stylesheet"): + if link.has_attr("href") and link["href"].startswith("/"): + # we don't need the vendors stylesheet + if "vendors~" in link["href"]: + continue + cached_css_file = self.cache_file(f'https://www.notion.so{link["href"]}') + # files in the css file might be reference with a relative path, + # so store the path of the current css file + parent_css_path = os.path.split(urllib.parse.urlparse(link["href"]).path)[0] + # open the locally saved file + with open(self.dist_folder / cached_css_file, "rb+") as f: + stylesheet = cssutils.parseString(f.read()) + # open the stylesheet and check for any font-face rule, + for rule in stylesheet.cssRules: + if rule.type == cssutils.css.CSSRule.FONT_FACE_RULE: + # if any are found, download the font file + # TODO: maths fonts have fallback font sources + font_file = ( + rule.style["src"].split("url(")[-1].split(")")[0] + ) + # assemble the url given the current css path + font_url = "/".join(p.strip("/") for p in ["https://www.notion.so", parent_css_path, font_file] if p.strip("/")) + # don't hash the font files filenames, rather get filename only + cached_font_file = self.cache_file(font_url, Path(font_file).name) + rule.style["src"] = f"url({cached_font_file})" + # commit stylesheet edits to file + f.seek(0) + f.truncate() + f.write(stylesheet.cssText) + + link["href"] = str(cached_css_file) + + def add_toggle_custom_logic(self, soup): + # add our custom logic to all toggle blocks + for toggle_block in soup.findAll("div", {"class": "notion-toggle-block"}): + toggle_id = uuid.uuid4() + toggle_button = toggle_block.select_one("div[role=button]") + toggle_content = toggle_block.find("div", {"class": None, "style": ""}) + if toggle_button and toggle_content: + # add a custom class to the toggle button and content, + # plus a custom attribute sharing a unique uiid so + # we can hook them up with some custom js logic later + toggle_button["class"] = toggle_block.get("class", []) + [ + "loconotion-toggle-button" + ] + toggle_content["class"] = toggle_content.get("class", []) + [ + "loconotion-toggle-content" + ] + toggle_content.attrs["loconotion-toggle-id"] = toggle_button.attrs[ + "loconotion-toggle-id" + ] = toggle_id + + def process_table_views(self, soup): + # if there are any table views in the page, add links to the title rows + # the link to the row item is equal to its data-block-id without dashes + for table_view in soup.findAll("div", {"class": "notion-table-view"}): + for table_row in table_view.findAll( + "div", {"class": "notion-collection-item"} + ): + table_row_block_id = table_row["data-block-id"] + table_row_href = "/" + table_row_block_id.replace("-", "") + row_target_span = table_row.find("span") + row_target_span["style"] = row_target_span["style"].replace("pointer-events: none;","") + row_link_wrapper = soup.new_tag( + "a", attrs={"href": table_row_href, "style": "cursor: pointer; color: inherit; text-decoration: none; fill: inherit;"} + ) + row_target_span.wrap(row_link_wrapper) + + def embed_custom_fonts(self, url, soup): + if not (custom_fonts := self.get_page_config(url).get("fonts", {})): + return + + # append a stylesheet importing the google font for each unique font + unique_custom_fonts = set(custom_fonts.values()) + for font in unique_custom_fonts: + if font: + google_fonts_embed_name = font.replace(" ", "+") + font_href = f"https://fonts.googleapis.com/css2?family={google_fonts_embed_name}:wght@500;600;700&display=swap" + custom_font_stylesheet = soup.new_tag( + "link", rel="stylesheet", href=font_href + ) + soup.head.append(custom_font_stylesheet) + + # go through each custom font, and add a css rule overriding the font-family + # to the font override stylesheet targetting the appropriate selector + font_override_stylesheet = soup.new_tag("style", type="text/css") + # embed custom google font(s) + fonts_selectors = { + "site": "div:not(.notion-code-block)", + "navbar": ".notion-topbar div", + "title": ".notion-page-block > div, .notion-collection_view_page-block > div[data-root]", + "h1": ".notion-header-block div, notion-page-content > notion-collection_view-block > div:first-child div", + "h2": ".notion-sub_header-block div", + "h3": ".notion-sub_sub_header-block div", + "body": ".notion-scroller", + "code": ".notion-code-block *", + } + for target, custom_font in custom_fonts.items(): + if custom_font and target != "site": + log.debug(f"Setting {target} font-family to {custom_font}") + font_override_stylesheet.append( + fonts_selectors[target] + + " {font-family:" + + custom_font + + " !important} " + ) + + site_font = custom_fonts.get("site", None) + if site_font: + log.debug(f"Setting global site font-family to {site_font}"), + font_override_stylesheet.append( + fonts_selectors["site"] + " {font-family:" + site_font + "} " + ) + + # finally append the font overrides stylesheets to the page + soup.head.append(font_override_stylesheet) + def inject_custom_tags(self, section: str, soup, custom_injects: dict): """Inject custom tags to the given section. @@ -667,6 +614,93 @@ class Parser: log.debug(f'Injecting <{section}> tag: {injected_tag}') soup.find(section).append(injected_tag) + def inject_loconotion_script_and_css(self, soup): + # inject loconotion's custom stylesheet and script + loconotion_custom_css = self.cache_file(Path("bundles/loconotion.css")) + custom_css = soup.new_tag( + "link", rel="stylesheet", href=str(loconotion_custom_css) + ) + soup.head.insert(-1, custom_css) + loconotion_custom_js = self.cache_file(Path("bundles/loconotion.js")) + custom_script = soup.new_tag( + "script", type="text/javascript", src=str(loconotion_custom_js) + ) + soup.body.insert(-1, custom_script) + + def find_subpages(self, url, index, soup, hrefDomain): + # find sub-pages and clean slugs / links + subpages = [] + parse_links = not self.get_page_config(url).get("no-links", False) + for a in soup.find_all('a', href=True): + sub_page_href = a["href"] + if sub_page_href.startswith("/"): + sub_page_href = f'{hrefDomain}/{a["href"].split("/")[len(a["href"].split("/"))-1]}' + log.info(f"Got this as href {sub_page_href}") + if sub_page_href.startswith(hrefDomain): + if parse_links or not len(a.find_parents("div", class_="notion-scroller")): + # if the link is an anchor link, + # check if the page hasn't already been parsed + if "#" in sub_page_href: + sub_page_href_tokens = sub_page_href.split("#") + sub_page_href = sub_page_href_tokens[0] + a["href"] = f'#{sub_page_href_tokens[-1]}' + a["class"] = a.get("class", []) + ["loconotion-anchor-link"] + if ( + sub_page_href in self.processed_pages.keys() + or sub_page_href in subpages + ): + log.debug( + f"Original page for anchor link {sub_page_href}" + " already parsed / pending parsing, skipping" + ) + continue + else: + a["href"] = ( + self.get_page_slug(sub_page_href) + if sub_page_href != index + else "index.html" + ) + subpages.append(sub_page_href) + log.debug(f"Found link to page {a['href']}") + else: + # if the page is set not to follow any links, strip the href + # do this only on children of .notion-scroller, we don't want + # to strip the links from the top nav bar + log.debug(f"Stripping link for {a['href']}") + del a["href"] + a.name = "span" + # remove pointer cursor styling on the link and all children + for child in ([a] + a.find_all()): + if (child.has_attr("style")): + style = cssutils.parseStyle(child['style']) + style['cursor'] = "default" + child['style'] = style.cssText + return subpages + + def export_parsed_page(self, url, index, soup): + # exports the parsed page + html_str = str(soup) + html_file = self.get_page_slug(url) if url != index else "index.html" + if html_file in self.processed_pages.values(): + log.error( + f"Found duplicate pages with slug '{html_file}' - previous one will be" + " overwritten. Make sure that your notion pages names or custom slugs" + " in the configuration files are unique" + ) + log.info(f"Exporting page '{url}' as '{html_file}'") + with open(self.dist_folder / html_file, "wb") as f: + f.write(html_str.encode("utf-8").strip()) + self.processed_pages[url] = html_file + + def parse_subpages(self, index, subpages): + # parse sub-pages + if subpages and not self.args.get("single_page", False): + if self.processed_pages: + log.debug(f"Pages processed so far: {len(self.processed_pages)}") + for sub_page in subpages: + if sub_page not in self.processed_pages.keys(): + self.parse_page(sub_page, index=index) + def load(self, url): self.driver.get(url) WebDriverWait(self.driver, 60).until(notion_page_loaded())