diff --git a/.gitignore b/.gitignore
index 7c2cda3..d1dc276 100644
--- a/.gitignore
+++ b/.gitignore
@@ -108,10 +108,7 @@ dmypy.json
 .pyre/
 
 # End of https://www.gitignore.io/api/python
-
+.env
 .vscode
-env
-dist/*
-test/*
-logs/*
-*.bat
\ No newline at end of file
+temp
+logs
\ No newline at end of file
diff --git a/README.md b/README.md
index e504f1e..57de2c7 100644
--- a/README.md
+++ b/README.md
@@ -40,11 +40,13 @@ It does, but I wasn't really happy with the styling - the pages looked a bit ugl
 
 ## Installation & Requirements
 
-`pip install -r requirements.txt`
+Make sure you're in your virtual environment of choice, then run
+- `poetry install --no-dev` if you have [Poetry](https://python-poetry.org/) installed
+- `pip install -r requirements.txt` otherwise
 
 This script uses [ChromeDriver](chromedriver.chromium.org) to automate the Google Chrome browser - therefore Google Chrome needs to be installed in order to work.
 
-The script comes bundled with the default windows chromedriver executable. On Max / Linux, download the right distribution for you from https://chromedriver.chromium.org/downloads and place the executable in this folder. Alternatively, use the `--chromedriver` argument to specify its path at runtime.
+The script will automatically try to download and use the appropriate chromedriver distribution for your OS and Chrome version. If this doesn't work, download the right version for you from https://chromedriver.chromium.org/downloads and use the `--chromedriver` argument to specify its path at runtime.
 
 ## Simple Usage
 
diff --git a/loconotion/__main__.py b/loconotion/__main__.py
index fc8ccad..a4d5627 100644
--- a/loconotion/__main__.py
+++ b/loconotion/__main__.py
@@ -8,95 +8,121 @@ from pathlib import Path
 log = logging.getLogger("loconotion")
 
 try:
-  import requests
-  import toml
+    import requests
+    import toml
 except ModuleNotFoundError as error:
-  log.critical(f"ModuleNotFoundError: {error}. have your installed the requirements?")
-  sys.exit()
+    log.critical(f"ModuleNotFoundError: {error}. have you installed the requirements?")
+    sys.exit()
 
 from notionparser import Parser
 
+
 def main():
-  # set up argument parser
-  argparser = argparse.ArgumentParser(description='Generate static websites from Notion.so pages')
-  argparser.add_argument('target', help='The config file containing the site properties, or the url of the Notion.so page to generate the site from')
-  argparser.add_argument('--chromedriver', help='Use a specific chromedriver executable instead of the auto-installing one')
-  argparser.add_argument("--single-page", action="store_true", help="Only parse the first page, then stop")
-  argparser.add_argument('--clean', action='store_true', help='Delete all previously cached files for the site before generating it')
-  argparser.add_argument('--non-headless', action='store_true', help='Run chromedriver in non-headless mode')
-  argparser.add_argument("-v", "--verbose", action="store_true", help="Increasite output log verbosity")
-  args = argparser.parse_args()
+    # set up argument parser
+    argparser = argparse.ArgumentParser(
+        description="Generate static websites from Notion.so pages"
+    )
+    argparser.add_argument(
+        "target",
+        help="The config file containing the site properties, or the url"
+        " of the Notion.so page to generate the site from",
+    )
+    argparser.add_argument(
+        "--chromedriver",
+        help="Use a specific chromedriver executable instead of the auto-installing one",
+    )
+    argparser.add_argument(
+        "--single-page", action="store_true", help="Only parse the first page, then stop"
+    )
+    argparser.add_argument(
+        "--clean",
+        action="store_true",
+        help="Delete all previously cached files for the site before generating it",
+    )
+    argparser.add_argument(
+        "--non-headless",
+        action="store_true",
+        help="Run chromedriver in non-headless mode",
+    )
+    argparser.add_argument(
+        "-v", "--verbose", action="store_true", help="Increase output log verbosity"
+    )
+    args = argparser.parse_args()
 
-  # set up some pretty logs
-  log = logging.getLogger("loconotion")
-  log.setLevel(logging.INFO if not args.verbose else logging.DEBUG)
-  log_screen_handler = logging.StreamHandler(stream=sys.stdout)
-  log.addHandler(log_screen_handler)
-  log.propagate = False
-  try:
-    import colorama, copy
-
-    LOG_COLORS = {
-      logging.DEBUG: colorama.Fore.GREEN,
-      logging.INFO: colorama.Fore.BLUE,
-      logging.WARNING: colorama.Fore.YELLOW,
-      logging.ERROR: colorama.Fore.RED,
-      logging.CRITICAL: colorama.Back.RED
-    }
-
-    class ColorFormatter(logging.Formatter):
-      def format(self, record, *args, **kwargs):
-        # if the corresponding logger has children, they may receive modified
-        # record, so we want to keep it intact
-        new_record = copy.copy(record)
-        if new_record.levelno in LOG_COLORS:
-          new_record.levelname = "{color_begin}{level}{color_end}".format(
-            level=new_record.levelname,
-            color_begin=LOG_COLORS[new_record.levelno],
-            color_end=colorama.Style.RESET_ALL,
-          )
-        return super(ColorFormatter, self).format(new_record, *args, **kwargs)
-
-    log_screen_handler.setFormatter(ColorFormatter(fmt='%(asctime)s %(levelname)-8s %(message)s',
-      datefmt="{color_begin}[%H:%M:%S]{color_end}".format(
-        color_begin=colorama.Style.DIM,
-        color_end=colorama.Style.RESET_ALL
-      )))
-  except ModuleNotFoundError as identifier:
-    pass
-
-  # initialise and run the website parser
-  try:
-    if urllib.parse.urlparse(args.target).scheme:
-      try:
-        response = requests.get(args.target)
-        if ("notion.so" in args.target):
-          log.info("Initialising parser with simple page url")
-          config = { "page" : args.target }
-          Parser(config = config, args = vars(args))
-        else:
-          log.critical(f"{args.target} is not a notion.so page")
-      except requests.ConnectionError as exception:
-        log.critical(f"Connection error")
-    else:
-      if Path(args.target).is_file():
-        with open(args.target) as f:
-          parsed_config = toml.loads(f.read())
-          log.info(f"Initialising parser with configuration file")
-          log.debug(parsed_config)
-          Parser(config = parsed_config, args = vars(args))
-      else:
-        log.critical(f"Config file {args.target} does not exists")
-  except FileNotFoundError as e:
-    log.critical(f'FileNotFoundError: {e}')
-    sys.exit(0)
-
-if __name__ == '__main__':
-  try:
-    main()
-  except KeyboardInterrupt:
-    log.critical('Interrupted by user')
+    # set up some pretty logs
+    log = logging.getLogger("loconotion")
+    log.setLevel(logging.INFO if not args.verbose else logging.DEBUG)
+    log_screen_handler = logging.StreamHandler(stream=sys.stdout)
+    log.addHandler(log_screen_handler)
+    log.propagate = False
     try:
-    sys.exit(0)
-  except SystemExit:
-    os._exit(0)
\ No newline at end of file
+        import colorama, copy
+
+        LOG_COLORS = {
+            logging.DEBUG: colorama.Fore.GREEN,
+            logging.INFO: colorama.Fore.BLUE,
+            logging.WARNING: colorama.Fore.YELLOW,
+            logging.ERROR: colorama.Fore.RED,
+            logging.CRITICAL: colorama.Back.RED,
+        }
+
+        class ColorFormatter(logging.Formatter):
+            def format(self, record, *args, **kwargs):
+                # if the corresponding logger has children, they may receive modified
+                # record, so we want to keep it intact
+                new_record = copy.copy(record)
+                if new_record.levelno in LOG_COLORS:
+                    new_record.levelname = "{color_begin}{level}{color_end}".format(
+                        level=new_record.levelname,
+                        color_begin=LOG_COLORS[new_record.levelno],
+                        color_end=colorama.Style.RESET_ALL,
+                    )
+                return super(ColorFormatter, self).format(new_record, *args, **kwargs)
+
+        log_screen_handler.setFormatter(
+            ColorFormatter(
+                fmt="%(asctime)s %(levelname)-8s %(message)s",
+                datefmt="{color_begin}[%H:%M:%S]{color_end}".format(
+                    color_begin=colorama.Style.DIM, color_end=colorama.Style.RESET_ALL
+                ),
+            )
+        )
+    except ModuleNotFoundError as identifier:
+        pass
+
+    # initialise and run the website parser
+    try:
+        if urllib.parse.urlparse(args.target).scheme:
+            try:
+                response = requests.get(args.target)
+                if "notion.so" in args.target:
+                    log.info("Initialising parser with simple page url")
+                    config = {"page": args.target}
+                    Parser(config=config, args=vars(args))
+                else:
+                    log.critical(f"{args.target} is not a notion.so page")
+            except requests.ConnectionError as exception:
+                log.critical(f"Connection error: {exception}")
+        else:
+            if Path(args.target).is_file():
+                with open(args.target) as f:
+                    parsed_config = toml.loads(f.read())
+                    log.info(f"Initialising parser with configuration file")
+                    log.debug(parsed_config)
+                    Parser(config=parsed_config, args=vars(args))
+            else:
+                log.critical(f"Config file {args.target} does not exist")
+    except FileNotFoundError as e:
+        log.critical(f"FileNotFoundError: {e}")
+        sys.exit(0)
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except KeyboardInterrupt:
+        log.critical("Interrupted by user")
+        try:
+            sys.exit(0)
+        except SystemExit:
+            os._exit(0)
diff --git a/loconotion/conditions.py b/loconotion/conditions.py
index 78c5f0c..ac81722 100644
--- a/loconotion/conditions.py
+++ b/loconotion/conditions.py
@@ -2,40 +2,54 @@ import logging
 log = logging.getLogger(f"loconotion.{__name__}")
 
-class notion_page_loaded(object):
-  """An expectation for checking that a notion page has loaded.
-  """
-  def __init__(self, url):
-    self.url = url
-  def __call__(self, driver):
-    notion_presence = len(driver.find_elements_by_class_name("notion-presence-container"))
-    collection_view_block = len(driver.find_elements_by_class_name("notion-collection_view_page-block"));
-    collection_search = len(driver.find_elements_by_class_name("collectionSearch"));
-    # embed_ghosts = len(driver.find_elements_by_css_selector("div[embed-ghost]"));
-    log.debug(f"Waiting for page content to load (presence container: {notion_presence}, loaders: {loading_spinners} )")
-    if (notion_presence and not loading_spinners):
-      return True
-    else:
-      return False
 
+class notion_page_loaded(object):
+    """An expectation for checking that a notion page has loaded."""
+
+    def __init__(self, url):
+        self.url = url
+
+    def __call__(self, driver):
+        notion_presence = len(
+            driver.find_elements_by_class_name("notion-presence-container")
+        )
+        collection_view_block = len(
+            driver.find_elements_by_class_name("notion-collection_view_page-block")
+        )
+        collection_search = len(driver.find_elements_by_class_name("collectionSearch"))
+        # embed_ghosts = len(driver.find_elements_by_css_selector("div[embed-ghost]"));
+        loading_spinners = len(driver.find_elements_by_class_name("loading-spinner"))
+        log.debug(
+            f"Waiting for page content to load"
+            f" (presence container: {notion_presence}, loaders: {loading_spinners} )"
+        )
+        if notion_presence and not loading_spinners:
+            return True
+        else:
+            return False
+
 
 class toggle_block_has_opened(object):
-  """An expectation for checking that a notion toggle block has been opened.
+    """An expectation for checking that a notion toggle block has been opened.
     It does so by checking if the div hosting the content has enough children,
-  and the abscence of the loading spinner.
-  """
-  def __init__(self, toggle_block):
-    self.toggle_block = toggle_block
+    and the absence of the loading spinner."""
 
-  def __call__(self, driver):
-    toggle_content = self.toggle_block.find_element_by_css_selector("div:not([style]")
-    if (toggle_content):
-      content_children = len(toggle_content.find_elements_by_tag_name("div"))
-      is_loading = len(self.toggle_block.find_elements_by_class_name("loading-spinner"));
-      log.debug(f"Waiting for toggle block to load ({content_children} children so far and {is_loading} loaders)")
-      if (content_children > 3 and not is_loading):
-        return True
-      else:
-        return False
-    else:
-      return False
\ No newline at end of file
+    def __init__(self, toggle_block):
+        self.toggle_block = toggle_block
+
+    def __call__(self, driver):
+        toggle_content = self.toggle_block.find_element_by_css_selector("div:not([style])")
+        if toggle_content:
+            content_children = len(toggle_content.find_elements_by_tag_name("div"))
+            is_loading = len(
+                self.toggle_block.find_elements_by_class_name("loading-spinner")
+            )
+            log.debug(
+                f"Waiting for toggle block to load"
+                f" ({content_children} children so far and {is_loading} loaders)"
+            )
+            if content_children > 3 and not is_loading:
+                return True
+            else:
+                return False
+        else:
+            return False
diff --git a/loconotion/notionparser.py b/loconotion/notionparser.py
index d7cbca0..5de4c71 100644
--- a/loconotion/notionparser.py
+++ b/loconotion/notionparser.py
@@ -14,478 +14,602 @@ from pathlib import Path
 log = logging.getLogger(f"loconotion.{__name__}")
 
 try:
-  import chromedriver_autoinstaller
-  from selenium import webdriver
-  from selenium.webdriver.chrome.options import Options
-  from selenium.common.exceptions import TimeoutException, NoSuchElementException
-  from selenium.webdriver.support import expected_conditions as EC
-  from selenium.webdriver.common.by import By
-  from selenium.webdriver.common.action_chains import ActionChains
-  from selenium.webdriver.support.ui import WebDriverWait
-  from bs4 import BeautifulSoup
-  import requests
-  import cssutils
-  cssutils.log.setLevel(logging.CRITICAL) # removes warning logs from cssutils
+    import chromedriver_autoinstaller
+    from selenium import webdriver
+    from selenium.webdriver.chrome.options import Options
+    from selenium.common.exceptions import TimeoutException, NoSuchElementException
+    from selenium.webdriver.support import expected_conditions as EC
+    from selenium.webdriver.common.by import By
+    from selenium.webdriver.common.action_chains import ActionChains
+    from selenium.webdriver.support.ui import WebDriverWait
+    from bs4 import BeautifulSoup
+    import requests
+    import cssutils
+
+    cssutils.log.setLevel(logging.CRITICAL)  # removes warning logs from cssutils
 except ModuleNotFoundError as error:
-  log.critical(f"ModuleNotFoundError: {error}. have your installed the requirements?")
-  sys.exit()
+    log.critical(f"ModuleNotFoundError: {error}. have you installed the requirements?")
+    sys.exit()
 
 from conditions import toggle_block_has_opened
 
-class Parser():
-  def __init__(self, config = {}, args = {}):
-    self.config = config
-    self.args = args
-    url = self.config.get("page", None)
-    if not url:
-      log.critical("No initial page url specified. If passing a configuration file," +
-        "make sure it contains a 'page' key with the url of the notion.so page to parse")
-      return
-    # get the site name from the config, or make it up by cleaning the target page's slug
-    site_name = self.config.get("name", self.get_page_slug(url, extension = False))
 
+class Parser:
+    def __init__(self, config={}, args={}):
+        self.config = config
+        self.args = args
+        url = self.config.get("page", None)
+        if not url:
+            log.critical(
+                "No initial page url specified. If passing a configuration file,"
+                " make sure it contains a 'page' key with the url of the notion.so"
+                " page to parse"
+            )
+            return
 
-    # set the output folder based on the site name
-    self.dist_folder = Path(config.get("output", Path("dist") / site_name))
-    log.info(f"Setting output path to '{self.dist_folder}'")
+        # get the site name from the config, or make it up by cleaning the target page's slug
+        site_name = self.config.get("name", self.get_page_slug(url, extension=False))
 
-    # check if the argument to clean the dist folder was passed
-    if (self.args.get("clean", False)):
-      try:
-        shutil.rmtree(self.dist_folder)
-        log.info(f"Removing previously cached files in '{self.dist_folder}'")
-      except OSError as e:
-        log.error(f"Cannot remove '{self.dist_folder}': {e}")
+        # set the output folder based on the site name
+        self.dist_folder = Path(config.get("output", Path("dist") / site_name))
+        log.info(f"Setting output path to '{self.dist_folder}'")
 
-    # create the output folder if necessary
-    self.dist_folder.mkdir(parents=True, exist_ok=True)
-
-    # initialize chromedriver and start parsing
-    self.driver = self.init_chromedriver()
-    self.run(url)
-
-  def get_page_config(self, token):
-    # starts by grabbing the gobal site configuration table, if exists
-    site_config = self.config.get("site", {})
-
-    # check if there's anything wrong with the site config
-    if (site_config.get("slug", None)):
-      log.error("'slug' parameter has no effect in the [site] table, and should only present in page tables.")
-      del site_config['slug']
-
-    # find a table in the configuration file whose key contains the passed token string
-    site_pages_config = self.config.get("pages", {})
-    matching_pages_config = [value for key, value in site_pages_config.items() if key.lower() in token]
-    if (matching_pages_config):
-      if (len(matching_pages_config) > 1):
-        log.error(f"multiple matching page config tokens found for {token} in configuration file. Make sure pages urls / slugs are unique")
-        return site_config
-      else:
-        # if found, merge it on top of the global site configuration table
-        # log.debug(f"Config table found for page with token {token}")
-        matching_page_config = matching_pages_config[0]
-        if (type(matching_page_config) is dict):
-          return {**site_config, **matching_page_config}
-        else:
-          log.error(f"Matching page configuration for {url} was not a dict: {matching_page_config} - something went wrong")
-          return site_config
-    else:
-      # log.debug(f"No config table found for page token {token}, using global site config table")
-      return site_config
-
-  def get_page_slug(self, url, extension = True):
-    # first check if the url has a custom slug configured in the config file
-    custom_slug = self.get_page_config(url).get("slug", None)
-    if custom_slug:
-      log.debug(f"Custom slug found for url '{url}': '{custom_slug}'")
-      return custom_slug.strip("/") + (".html" if extension else "")
-    else:
-      # if not, clean up the existing slug
-      path = urllib.parse.urlparse(url).path.strip("/")
-      if ("-" in path and len(path.split("-")) > 1):
-        # a standard notion page looks like the-page-title-[uiid]
-        # strip the uuid and keep the page title only
-        path = "-".join(path.split("-")[:-1]).lower()
-      elif ("?" in path):
-        # database pages just have an uiid and a query param
-        # not much to do here, just get rid of the query param
-        path = path.split("?")[0].lower()
-      return path + (".html" if extension else "")
-
-  def cache_file(self, url, filename = None):
-    # stringify the url in case it's a Path object
-    url = str(url)
-
-    # if no filename specificed, generate an hashed id based the query-less url,
-    # so we avoid re-downloading / caching files we already have
-    if (not filename):
-      parsed_url = urllib.parse.urlparse(url)
-      queryless_url = parsed_url.netloc + parsed_url.path
-      query_params = urllib.parse.parse_qs(parsed_url.query)
-      # if any of the query params contains a size parameters store it in the has
-      # so we can download other higher-resolution versions if needed
-      if ("width" in query_params.keys()):
-        queryless_url = queryless_url + f"?width={query_params['width']}"
-      filename = hashlib.sha1(str.encode(queryless_url)).hexdigest();
-    destination = self.dist_folder / filename
-
-    # check if there are any files matching the filename, ignoring extension
-    matching_file = glob.glob(str(destination.with_suffix('.*')))
-    if not matching_file:
-      # if url has a network scheme, download the file
-      if "http" in urllib.parse.urlparse(url).scheme:
-        try:
-          # Disabling proxy speeds up requests time
-          # https://stackoverflow.com/questions/45783655/first-https-request-takes-much-more-time-than-the-rest
-          # https://stackoverflow.com/questions/28521535/requests-how-to-disable-bypass-proxy
-          session = requests.Session()
-          session.trust_env = False
-          log.info(f"Downloading '{url}'")
-          response = session.get(url)
-
-          # if the filename does not have an extension at this point,
-          # try to infer it from the url, and if not possible,
-          # from the content-type header mimetype
-          if (not destination.suffix):
-            file_extension = Path(urllib.parse.urlparse(url).path).suffix
-            if (not file_extension):
-              content_type = response.headers.get('content-type')
-              if (content_type):
-                file_extension = mimetypes.guess_extension(content_type)
-            destination = destination.with_suffix(file_extension)
-
-          Path(destination).parent.mkdir(parents=True, exist_ok=True)
-          with open(destination, "wb") as f:
-            f.write(response.content)
-
-          return destination.relative_to(self.dist_folder)
-        except Exception as error:
-          log.error(f"Error downloading file '{url}': {error}")
-          return url
-      # if not, check if it's a local file, and copy it to the dist folder
-      else:
-        if Path(url).is_file():
-          log.debug(f"Caching local file '{url}'")
-          destination = destination.with_suffix(Path(url).suffix)
-          shutil.copyfile(url, destination)
-          return destination.relative_to(self.dist_folder)
-    # if we already have a matching cached file, just return its relative path
-    else:
-      cached_file = Path(matching_file[0]).relative_to(self.dist_folder)
-      log.debug(f"'{url}' was already downloaded")
-      return cached_file
-
-  def init_chromedriver(self):
-    chromedriver_path = self.args.get("chromedriver")
-    if (not chromedriver_path):
-      try:
-        chromedriver_path = chromedriver_autoinstaller.install()
-      except Exception as exception:
-        log.critical(f"Failed to install the built-in chromedriver: {exception}\n" +
-          "download the correct version for your system at https://chromedriver.chromium.org/downloads" +
-          "and use the --chromedriver argument to point to the chromedriver executable")
-        sys.exit()
-
-    log.info(f"Initialising chromedriver at {chromedriver_path}")
-    logs_path = (Path.cwd() / "logs" / "webdrive.log")
-    logs_path.parent.mkdir(parents=True, exist_ok=True)
-
-    chrome_options = Options()
-    if (not self.args.get("non_headless", False)):
-      chrome_options.add_argument("--headless")
-    chrome_options.add_argument("window-size=1920,1080")
-    chrome_options.add_argument("--log-level=3");
-    chrome_options.add_argument("--silent");
-    chrome_options.add_argument("--disable-logging")
-    # removes the 'DevTools listening' log message
-    chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
-    return webdriver.Chrome(
-      executable_path=str(chromedriver_path),
-      service_log_path=str(logs_path),
-      options=chrome_options)
-
-  def parse_page(self, url, processed_pages = {}, index = None):
-    # if this is the first page being parse, set it as the index.html
-    if (not index):
-      index = url;
-
-    log.info(f"Parsing page '{url}'")
-    log.debug(f"Using page config: {self.get_page_config(url)}")
-    self.driver.get(url)
-
-    # if ("This content does not exist" in self.driver.page_source):
-    #   log.error(f"No content found in {url}. Are you sure the page is set to public?")
-    #   return
-
-    try:
-      # WebDriverWait(self.driver, 10).until(notion_page_loaded())
-      WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'notion-presence-container')))
-    except TimeoutException as ex:
-      log.critical("Timeout waiting for page content to load, or no content found. Are you sure the page is set to public?")
-      return
-
-    # cooldown to allow eventual database items to load
-    # TODO: figure out a way to detect they loaded
-    time.sleep(2)
-
-    # function to expand all the toggle block in the page to make their content visible
-    # so we can hook up our custom toggle logic afterwards
-    def open_toggle_blocks(exclude = []):
-      opened_toggles = exclude;
-      toggle_blocks = self.driver.find_elements_by_class_name("notion-toggle-block")
-      log.debug(f"Opening {len(toggle_blocks)} new toggle blocks in the page")
-      for toggle_block in toggle_blocks:
-        if (not toggle_block in opened_toggles):
-          toggle_button = toggle_block.find_element_by_css_selector("div[role=button]")
-          # check if the toggle is already open by the direction of its arrow
-          is_toggled = "(180deg)" in (toggle_button.find_element_by_tag_name("svg").get_attribute("style"))
-          if (not is_toggled):
-            # click on it, then wait until all elements are displayed
-            toggle_button.click()
+        # check if the argument to clean the dist folder was passed
+        if self.args.get("clean", False):
             try:
-              WebDriverWait(self.driver, 10).until(toggle_block_has_opened(toggle_block))
-            except TimeoutException as ex:
-              log.warning("Timeout waiting for toggle block to open. Likely it's already open, but doesn't hurt to check.")
-            except Exception as ex:
-              log.error("Something went wrong with selenium while trying to open a toggle block")
-          opened_toggles.append(toggle_block)
-      # after all toggles have been opened, check the page again to see if
-      # any toggle block had nested toggle blocks inside them
-      new_toggle_blocks = self.driver.find_elements_by_class_name("notion-toggle-block")
-      if (len(new_toggle_blocks) > len(toggle_blocks)):
-        # if so, run the function again
-        open_toggle_blocks(opened_toggles)
-    # open the toggle blocks in the page
-    open_toggle_blocks()
+                shutil.rmtree(self.dist_folder)
+                log.info(f"Removing previously cached files in '{self.dist_folder}'")
+            except OSError as e:
+                log.error(f"Cannot remove '{self.dist_folder}': {e}")
 
-    # creates soup from the page to start parsing
-    soup = BeautifulSoup(self.driver.page_source, "html.parser")
+        # create the output folder if necessary
+        self.dist_folder.mkdir(parents=True, exist_ok=True)
 
+        # initialize chromedriver and start parsing
+        self.driver = self.init_chromedriver()
+        self.run(url)
 
-    # remove scripts and other tags we don't want / need
-    for unwanted in soup.findAll('script'):
-      unwanted.decompose();
-    for intercom_frame in soup.findAll('div',{'id':'intercom-frame'}):
-      intercom_frame.decompose();
-    for intercom_div in soup.findAll('div',{'class':'intercom-lightweight-app'}):
-      intercom_div.decompose();
-    for overlay_div in soup.findAll('div',{'class':'notion-overlay-container'}):
-      overlay_div.decompose();
-    for vendors_css in soup.find_all("link", href=lambda x: x and 'vendors~' in x):
-      vendors_css.decompose();
+    def get_page_config(self, token):
+        # starts by grabbing the global site configuration table, if exists
+        site_config = self.config.get("site", {})
 
+        # check if there's anything wrong with the site config
+        if site_config.get("slug", None):
+            log.error(
+                "'slug' parameter has no effect in the [site] table, "
+                "and should only be present in page tables."
+            )
+            del site_config["slug"]
 
-    # clean up the default notion meta tags
-    for tag in ["description", "twitter:card", "twitter:site", "twitter:title", "twitter:description", "twitter:image", "twitter:url", "apple-itunes-app"]:
-      unwanted_tag = soup.find("meta", attrs = { "name" : tag})
-      if (unwanted_tag): unwanted_tag.decompose();
-    for tag in ["og:site_name", "og:type", "og:url", "og:title", "og:description", "og:image"]:
-      unwanted_og_tag = soup.find("meta", attrs = { "property" : tag})
-      if (unwanted_og_tag): unwanted_og_tag.decompose();
-
-    # set custom meta tags
-    custom_meta_tags = self.get_page_config(url).get("meta", [])
-    for custom_meta_tag in custom_meta_tags:
-      tag = soup.new_tag('meta')
-      for attr, value in custom_meta_tag.items():
-        tag.attrs[attr] = value
-      log.debug(f"Adding meta tag {str(tag)}")
-      soup.head.append(tag)
-
-    # process images
-    cache_images = True
-    for img in soup.findAll('img'):
-      if img.has_attr('src'):
-        if (cache_images and not 'data:image' in img['src']):
-          img_src = img['src']
-          # if the path starts with /, it's one of notion's predefined images
-          if (img['src'].startswith('/')):
-            img_src = "https://www.notion.so" + img['src']
-            # notion's own default images urls are in a weird format, need to sanitize them
-            # img_src = 'https://www.notion.so' + img['src'].split("notion.so")[-1].replace("notion.so", "").split("?")[0]
-            # if (not '.amazonaws' in img_src):
-            #   img_src = urllib.parse.unquote(img_src)
-
-          cached_image = self.cache_file(img_src)
-          img['src'] = cached_image
+        # find a table in the configuration file whose key contains the passed token string
+        site_pages_config = self.config.get("pages", {})
+        matching_pages_config = [
+            value for key, value in site_pages_config.items() if key.lower() in token
+        ]
+        if matching_pages_config:
+            if len(matching_pages_config) > 1:
+                log.error(
+                    f"multiple matching page config tokens found for {token}"
+                    " in configuration file. Make sure pages urls / slugs are unique"
+                )
+                return site_config
+            else:
+                # if found, merge it on top of the global site configuration table
+                # log.debug(f"Config table found for page with token {token}")
+                matching_page_config = matching_pages_config[0]
+                if type(matching_page_config) is dict:
+                    return {**site_config, **matching_page_config}
+                else:
+                    log.error(
+                        f"Matching page configuration for {token} was not a dict:"
+                        f" {matching_page_config} - something went wrong"
+                    )
+                    return site_config
         else:
-          if (img['src'].startswith('/')):
-            img['src'] = "https://www.notion.so" + img['src']
+            # log.debug(f"No config table found for page token {token}, using global site config table")
+            return site_config
 
-    # process stylesheets
-    for link in soup.findAll('link', rel="stylesheet"):
-      if link.has_attr('href') and link['href'].startswith('/'):
-        # we don't need the vendors stylesheet
-        if ("vendors~" in link['href']):
-          continue
-        # css_file = link['href'].strip("/")
-        cached_css_file = self.cache_file('https://www.notion.so' + link['href'])
-        with open(self.dist_folder / cached_css_file, 'rb') as f:
-          stylesheet = cssutils.parseString(f.read())
-          # open the stylesheet and check for any font-face rule,
-          for rule in stylesheet.cssRules:
-            if rule.type == cssutils.css.CSSRule.FONT_FACE_RULE:
-              # if any are found, download the font file
-              font_file = rule.style['src'].split("url(/")[-1].split(") format")[0]
-              cached_font_file = self.cache_file(f'https://www.notion.so/{font_file}')
-              rule.style['src'] = f"url({str(cached_font_file)})"
-        link['href'] = str(cached_css_file)
+    def get_page_slug(self, url, extension=True):
+        # first check if the url has a custom slug configured in the config file
+        custom_slug = self.get_page_config(url).get("slug", None)
+        if custom_slug:
+            log.debug(f"Custom slug found for url '{url}': '{custom_slug}'")
+            return custom_slug.strip("/") + (".html" if extension else "")
+        else:
+            # if not, clean up the existing slug
+            path = urllib.parse.urlparse(url).path.strip("/")
+            if "-" in path and len(path.split("-")) > 1:
+                # a standard notion page looks like the-page-title-[uuid]
+                # strip the uuid and keep the page title only
+                path = "-".join(path.split("-")[:-1]).lower()
+            elif "?" in path:
+                # database pages just have a uuid and a query param
+                # not much to do here, just get rid of the query param
+                path = path.split("?")[0].lower()
+            return path + (".html" if extension else "")
 
-    # add our custom logic to all toggle blocks
-    for toggle_block in soup.findAll('div',{'class':'notion-toggle-block'}):
-      toggle_id = uuid.uuid4()
-      toggle_button = toggle_block.select_one('div[role=button]')
-      toggle_content = toggle_block.find('div', {'class': None, 'style': ''})
-      if (toggle_button and toggle_content):
-        # add a custom class to the toggle button and content, plus a custom attribute
-        # sharing a unique uiid so we can hook them up with some custom js logic later
-        toggle_button['class'] = toggle_block.get('class', []) + ['loconotion-toggle-button']
-        toggle_content['class'] = toggle_content.get('class', []) + ['loconotion-toggle-content']
-        toggle_content.attrs['loconotion-toggle-id'] = toggle_button.attrs['loconotion-toggle-id'] = toggle_id
+    def cache_file(self, url, filename=None):
+        # stringify the url in case it's a Path object
+        url = str(url)
 
+        # if no filename specified, generate a hashed id based on the query-less url,
+        # so we avoid re-downloading / caching files we already have
+        if not filename:
+            parsed_url = urllib.parse.urlparse(url)
+            queryless_url = parsed_url.netloc + parsed_url.path
+            query_params = urllib.parse.parse_qs(parsed_url.query)
+            # if any of the query params contain a size parameter, store it in the hash
+            # so we can download other higher-resolution versions if needed
+            if "width" in query_params.keys():
+                queryless_url = queryless_url + f"?width={query_params['width']}"
+            filename = hashlib.sha1(str.encode(queryless_url)).hexdigest()
+        destination = self.dist_folder / filename
 
-    # if there are any table views in the page, add links to the title rows
-    for table_view in soup.findAll('div', {'class':'notion-table-view'}):
-      for table_row in table_view.findAll('div', {'class':'notion-collection-item'}):
-        # for each row, hover the mouse over it to make the open button appear,
-        # then grab its href and wrap the table row's name into a link
-        table_row_block_id = table_row['data-block-id']
-        table_row_hover_target = self.driver.find_element_by_css_selector(f"div[data-block-id='{table_row_block_id}'] > div > div")
-        # need to scroll the row into view or else the open button won't visible to selenium
-        self.driver.execute_script("arguments[0].scrollIntoView();", table_row_hover_target)
-        ActionChains(self.driver).move_to_element(table_row_hover_target).perform()
+        # check if there are any files matching the filename, ignoring extension
+        matching_file = glob.glob(str(destination.with_suffix(".*")))
+        if not matching_file:
+            # if url has a network scheme, download the file
+            if "http" in urllib.parse.urlparse(url).scheme:
+                try:
+                    # Disabling proxy speeds up requests time
+                    # https://stackoverflow.com/questions/45783655/first-https-request-takes-much-more-time-than-the-rest
+                    # https://stackoverflow.com/questions/28521535/requests-how-to-disable-bypass-proxy
+                    session = requests.Session()
+                    session.trust_env = False
+                    log.info(f"Downloading '{url}'")
+                    response = session.get(url)
+
+                    # if the filename does not have an extension at this point,
+                    # try to infer it from the url, and if not possible,
+                    # from the content-type header mimetype
+                    if not destination.suffix:
+                        file_extension = Path(urllib.parse.urlparse(url).path).suffix
+                        if not file_extension:
+                            content_type = response.headers.get("content-type")
+                            if content_type:
+                                file_extension = mimetypes.guess_extension(content_type)
+                        destination = destination.with_suffix(file_extension)
+
+                    Path(destination).parent.mkdir(parents=True, exist_ok=True)
+                    with open(destination, "wb") as f:
+                        f.write(response.content)
+
+                    return destination.relative_to(self.dist_folder)
+                except Exception as error:
+                    log.error(f"Error downloading file '{url}': {error}")
+                    return url
+            # if not, check if it's a local file, and copy it to the dist folder
+            else:
+                if Path(url).is_file():
+                    log.debug(f"Caching local file '{url}'")
+                    destination = destination.with_suffix(Path(url).suffix)
+                    shutil.copyfile(url, destination)
+                    return destination.relative_to(self.dist_folder)
+        # if we already have a matching cached file, just return its relative path
+        else:
+            cached_file = Path(matching_file[0]).relative_to(self.dist_folder)
+            log.debug(f"'{url}' was already downloaded")
+            return cached_file
+
+    def init_chromedriver(self):
+        chromedriver_path = self.args.get("chromedriver")
+        if not chromedriver_path:
+            try:
+                chromedriver_path = chromedriver_autoinstaller.install()
+            except Exception as exception:
+                log.critical(
+                    f"Failed to install the built-in chromedriver: {exception}\n"
+                    "\nDownload the correct version for your system at"
+                    " https://chromedriver.chromium.org/downloads and use the"
+                    " --chromedriver argument to point to the chromedriver executable"
+                )
+                sys.exit()
+
+        log.info(f"Initialising chromedriver at {chromedriver_path}")
+        logs_path = Path.cwd() / "logs" / "webdrive.log"
+        logs_path.parent.mkdir(parents=True, exist_ok=True)
+
+        chrome_options = Options()
+        if not self.args.get("non_headless", False):
+            chrome_options.add_argument("--headless")
+        chrome_options.add_argument("window-size=1920,1080")
+        chrome_options.add_argument("--log-level=3")
+        chrome_options.add_argument("--silent")
+        chrome_options.add_argument("--disable-logging")
+        # removes the 'DevTools listening' log message
+        chrome_options.add_experimental_option("excludeSwitches", ["enable-logging"])
+        return webdriver.Chrome(
+            executable_path=str(chromedriver_path),
+            service_log_path=str(logs_path),
+            options=chrome_options,
+        )
+
+    def parse_page(self, url, processed_pages={}, index=None):
+        # if this is the first page being parsed, set it as the index.html
+        if not index:
+            index = url
+
+        log.info(f"Parsing page '{url}'")
+        log.debug(f"Using page config: {self.get_page_config(url)}")
+        self.driver.get(url)
+
+        # if "This content does not exist" in self.driver.page_source:
+        #     log.error(
+        #         f"No content found in {url}."
+        #         " Are you sure the page is set to public?"
+        #     )
+        #     return
 
-    # if there are any table views in the page, add links to the title rows
-    for table_view in soup.findAll('div', {'class':'notion-table-view'}):
-      for table_row in table_view.findAll('div', {'class':'notion-collection-item'}):
-        # for each row, hover the mouse over it to make the open button appear,
-        # then grab its href and wrap the table row's name into a link
-        table_row_block_id = table_row['data-block-id']
-        table_row_hover_target = self.driver.find_element_by_css_selector(f"div[data-block-id='{table_row_block_id}'] > div > div")
-        # need to scroll the row into view or else the open button won't visible to selenium
-        self.driver.execute_script("arguments[0].scrollIntoView();", table_row_hover_target)
-        ActionChains(self.driver).move_to_element(table_row_hover_target).perform()
         try:
-          WebDriverWait(self.driver, 5).until(EC.visibility_of_element_located(
-            (By.CSS_SELECTOR, f"div[data-block-id='{table_row_block_id}'] > div > a")))
+            # WebDriverWait(self.driver, 10).until(notion_page_loaded())
+            WebDriverWait(self.driver, 10).until(
+                EC.presence_of_element_located(
+                    (By.CLASS_NAME, "notion-presence-container")
+                )
+            )
         except TimeoutException as ex:
-          log.error(f"Timeout waiting for the 'open' button for row in table with block id {table_row_block_id}")
-        table_row_href = self.driver.find_element_by_css_selector(f"div[data-block-id='{table_row_block_id}'] > div > a").get_attribute('href')
-        table_row_href = table_row_href.split("notion.so")[-1]
-        row_target_span = table_row.find("span")
-        row_link_wrapper = soup.new_tag('a', attrs={'href': table_row_href, 'style':"cursor: pointer;"})
-        row_target_span.wrap(row_link_wrapper)
+            log.critical(
+                "Timeout waiting for page content to load, or no content found."
+                " Are you sure the page is set to public?"
+            )
+            return
 
+        # cooldown to allow eventual database items to load
+        # TODO: figure out a way to detect they loaded
+        time.sleep(2)
 
+        # function to expand all the toggle blocks in the page to make their content visible
+        # so we can hook up our custom toggle logic afterwards
+        def open_toggle_blocks(exclude=[]):
+            opened_toggles = exclude
+            toggle_blocks = self.driver.find_elements_by_class_name("notion-toggle-block")
+            log.debug(f"Opening {len(toggle_blocks)} new toggle blocks in the page")
+            for toggle_block in toggle_blocks:
+                if toggle_block not in opened_toggles:
+                    toggle_button = toggle_block.find_element_by_css_selector(
+                        "div[role=button]"
+                    )
+                    # check if the toggle is already open by the direction of its arrow
+                    is_toggled = "(180deg)" in (
+                        toggle_button.find_element_by_tag_name("svg").get_attribute(
+                            "style"
+                        )
+                    )
+                    if not is_toggled:
+                        # click on it, then wait until all elements are displayed
+                        toggle_button.click()
+                        try:
+                            WebDriverWait(self.driver, 10).until(
+                                toggle_block_has_opened(toggle_block)
+                            )
+                        except TimeoutException as ex:
+                            log.warning(
+                                "Timeout waiting for toggle block to open."
+                                " Likely it's already open, but doesn't hurt to check."
+                            )
+                        except Exception as ex:
+                            log.error(
+                                "Something went wrong while trying to open a toggle block"
+                            )
+                        opened_toggles.append(toggle_block)
+
+            # after all toggles have been opened, check the page again to see if
+            # any toggle block had nested toggle blocks inside them
+            new_toggle_blocks = self.driver.find_elements_by_class_name(
+                "notion-toggle-block"
+            )
+            if len(new_toggle_blocks) > len(toggle_blocks):
+                # if so, run the function again
+                open_toggle_blocks(opened_toggles)
+
+        # open the toggle blocks in the page
+        open_toggle_blocks()
+
+        # creates soup from the page to start parsing
+        soup = BeautifulSoup(self.driver.page_source, "html.parser")
+
+        # remove scripts and other tags we don't want / need
+        for unwanted in soup.findAll("script"):
+            unwanted.decompose()
+        for intercom_frame in soup.findAll("div", {"id": "intercom-frame"}):
+            intercom_frame.decompose()
+        for intercom_div in soup.findAll("div", {"class": "intercom-lightweight-app"}):
+            intercom_div.decompose()
+        for overlay_div in soup.findAll("div", {"class": "notion-overlay-container"}):
+            overlay_div.decompose()
+        for vendors_css in soup.find_all("link", href=lambda x: x and "vendors~" in x):
+            vendors_css.decompose()
+
+        # clean up the default notion meta tags
+        for tag in [
+            "description",
+            "twitter:card",
+            "twitter:site",
+            "twitter:title",
+            "twitter:description",
+            "twitter:image",
+            "twitter:url",
+            "apple-itunes-app",
+        ]:
+            unwanted_tag = soup.find("meta", attrs={"name": tag})
+            if unwanted_tag:
+                unwanted_tag.decompose()
+        for tag in [
+            "og:site_name",
+            "og:type",
+            "og:url",
+            "og:title",
+            "og:description",
+            "og:image",
+        ]:
+            unwanted_og_tag = soup.find("meta", attrs={"property": tag})
+            if unwanted_og_tag:
+                unwanted_og_tag.decompose()
 
+        # set custom meta tags
+        custom_meta_tags = self.get_page_config(url).get("meta", [])
+        for custom_meta_tag in custom_meta_tags:
+            tag = soup.new_tag("meta")
+            for attr, value in custom_meta_tag.items():
+                tag.attrs[attr] = value
+            log.debug(f"Adding meta tag {str(tag)}")
+            soup.head.append(tag)
 
+        # process images
+        cache_images = True
+        for img in soup.findAll("img"):
+            if img.has_attr("src"):
+                if cache_images and "data:image" not in img["src"]:
+                    img_src = img["src"]
+                    # if the path starts with /, it's one of notion's predefined images
+                    if img["src"].startswith("/"):
+                        img_src = "https://www.notion.so" + img["src"]
+                        # notion's own default images urls are in a weird format, need to sanitize them
+                        # img_src = 'https://www.notion.so' + img['src'].split("notion.so")[-1].replace("notion.so", "").split("?")[0]
+                        # if (not '.amazonaws' in img_src):
+                        #     img_src = urllib.parse.unquote(img_src)
 
+                    cached_image = self.cache_file(img_src)
+                    img["src"] = cached_image
+                else:
+                    if img["src"].startswith("/"):
+                        img["src"] = "https://www.notion.so" + img["src"]
 
+        # process stylesheets
+        for link in soup.findAll("link", rel="stylesheet"):
+            if link.has_attr("href") and link["href"].startswith("/"):
+                # we don't need the vendors stylesheet
+                if "vendors~" in link["href"]:
+                    continue
+                # css_file = link['href'].strip("/")
+                cached_css_file = self.cache_file("https://www.notion.so" + link["href"])
+                with open(self.dist_folder / cached_css_file, "rb") as f:
+                    stylesheet = cssutils.parseString(f.read())
+                    # open the stylesheet and check for any font-face rule,
+                    for rule in stylesheet.cssRules:
+                        if rule.type == cssutils.css.CSSRule.FONT_FACE_RULE:
+                            # if any are found, download the font file
+                            font_file = (
+                                rule.style["src"].split("url(/")[-1].split(") format")[0]
+                            )
+                            cached_font_file = self.cache_file(
+                                f"https://www.notion.so/{font_file}"
+                            )
+                            rule.style["src"] = f"url({str(cached_font_file)})"
+                link["href"] = str(cached_css_file)
 
+        # add our custom logic to all toggle blocks
+        for toggle_block in soup.findAll("div", {"class": "notion-toggle-block"}):
+            toggle_id = uuid.uuid4()
+            toggle_button = toggle_block.select_one("div[role=button]")
+            toggle_content = toggle_block.find("div", {"class": None, "style": ""})
+            if toggle_button and toggle_content:
+                # add a custom class to the toggle button and content,
+                # plus a custom attribute sharing a unique uuid so
+                # we can hook them up with some custom js logic later
+                toggle_button["class"] = toggle_block.get("class", []) + [
+                    "loconotion-toggle-button"
+                ]
+                toggle_content["class"] = toggle_content.get("class", []) + [
+                    "loconotion-toggle-content"
+                ]
+                toggle_content.attrs["loconotion-toggle-id"] = toggle_button.attrs[
+                    "loconotion-toggle-id"
+                ] = toggle_id
 
+        # if there are any table views in the page, add links to the title rows
+        for table_view in soup.findAll("div", {"class": "notion-table-view"}):
+            for table_row in table_view.findAll(
+                "div", {"class": "notion-collection-item"}
+            ):
+                # for each row, hover the mouse over it to make the open button appear,
+                # then grab its href and wrap the table row's name into a link
+                table_row_block_id = table_row["data-block-id"]
+                table_row_hover_target = self.driver.find_element_by_css_selector(
+                    f"div[data-block-id='{table_row_block_id}'] > div > div"
+                )
+                # need to scroll the row into view or else
+                # the open button won't be visible to selenium
+                self.driver.execute_script(
+                    "arguments[0].scrollIntoView();", table_row_hover_target
+                )
+                ActionChains(self.driver).move_to_element(
+                    table_row_hover_target
+                ).perform()
+                try:
+                    WebDriverWait(self.driver, 5).until(
+                        EC.visibility_of_element_located(
+                            (
+                                By.CSS_SELECTOR,
+                                f"div[data-block-id='{table_row_block_id}'] > div > a",
+                            )
+                        )
+                    )
+                except TimeoutException as ex:
+                    log.error(
+                        f"Timeout waiting for the 'open' button to appear for"
+                        f" row in table with block id {table_row_block_id}"
+                    )
+                table_row_href = self.driver.find_element_by_css_selector(
+                    f"div[data-block-id='{table_row_block_id}'] > div > a"
+                ).get_attribute("href")
+                table_row_href = table_row_href.split("notion.so")[-1]
+                row_target_span = table_row.find("span")
+                row_link_wrapper = soup.new_tag(
+                    "a", attrs={"href": table_row_href, "style": "cursor: pointer;"}
+                )
+                row_target_span.wrap(row_link_wrapper)
 
-    # embed custom google font(s)
-    fonts_selectors = {
-      "site" : "div:not(.notion-code-block)",
-      "navbar": ".notion-topbar div",
-      "title" : ".notion-page-block > div, .notion-collection_view_page-block > div",
-      "h1" : ".notion-header-block div, notion-page-content > notion-collection_view-block > div:first-child div",
-      "h2" : ".notion-sub_header-block div",
-      "h3" : ".notion-sub_sub_header-block div",
-      "body" : ".notion-app-inner",
-      "code" : ".notion-code-block *",
-    }
-    custom_fonts = self.get_page_config(url).get("fonts", {})
-    if (custom_fonts):
-      # append a stylesheet importing the google font for each unique font
-      unique_custom_fonts = set(custom_fonts.values())
-      for font in unique_custom_fonts:
-        if (font):
-          google_fonts_embed_name = font.replace(" ", "+")
-          font_href = f"https://fonts.googleapis.com/css2?family={google_fonts_embed_name}:wght@500;600;700&display=swap"
-          custom_font_stylesheet = soup.new_tag("link", rel="stylesheet", href=font_href)
-          soup.head.append(custom_font_stylesheet);
-
-      # go through each custom font, and add a css rule overriding the font-family
-      # to the font override stylesheet targetting the appropriate selector
-      font_override_stylesheet = soup.new_tag('style', type='text/css')
-      for target, custom_font in custom_fonts.items():
-        if custom_font and not target == "site":
-          log.debug(f"Setting {target} font-family to {custom_font}")
-          font_override_stylesheet.append(fonts_selectors[target] + " {font-family:" + custom_font + " !important} ")
-      site_font = custom_fonts.get("site", None)
-      # process global site font last to more granular settings can override it
-      if (site_font):
-        log.debug(f"Setting global site font-family to {site_font}"),
-        font_override_stylesheet.append(fonts_selectors["site"] + " {font-family:" + site_font + "} ")
-      # finally append the font overrides stylesheets to the page
-      soup.head.append(font_override_stylesheet)
-
-
-    # inject any custom elements to the page
-    custom_injects = self.get_page_config(url).get("inject", {})
-    def injects_custom_tags(section):
-      section_custom_injects = custom_injects.get(section, {})
-      for tag, elements in section_custom_injects.items():
-        for element in elements:
-          injected_tag = soup.new_tag(tag)
-          for attr, value in element.items():
-            injected_tag[attr] = value
-            # if the value refers to a file, copy it to the dist folder
-            if (attr.lower() == "href" or attr.lower() == "src"):
-              log.debug(f"Copying injected file '{value}'")
-              cached_custom_file = self.cache_file((Path.cwd() / value.strip("/")))
-              # destination = (self.dist_folder / source.name)
-              # shutil.copyfile(source, destination)
-              injected_tag[attr] = str(cached_custom_file) #source.name
-          log.debug(f"Injecting <{section}> tag: {str(injected_tag)}")
-          soup.find(section).append(injected_tag)
-    injects_custom_tags("head")
-    injects_custom_tags("body")
-
-    # inject loconotion's custom stylesheet and script
-    loconotion_custom_css = self.cache_file(Path("bundles/loconotion.css"))
-    custom_css = soup.new_tag("link", rel="stylesheet", href=str(loconotion_custom_css))
-    soup.head.insert(-1, custom_css)
-    loconotion_custom_js = self.cache_file(Path("bundles/loconotion.js"))
-    custom_script = soup.new_tag("script", type="text/javascript", src=str(loconotion_custom_js))
-    soup.body.insert(-1, custom_script)
-
-    # find sub-pages and clean slugs / links
-    sub_pages = [];
-    for a in soup.findAll('a'):
-      if a['href'].startswith('/'):
-        sub_page_href = 'https://www.notion.so' + a['href']
-        # if the link is an anchor link, check if the page hasn't already been parsed
-        if ("#" in sub_page_href):
-          sub_page_href_tokens = sub_page_href.split("#")
-          sub_page_href = sub_page_href_tokens[0]
-          a['href'] = "#" + sub_page_href_tokens[-1]
-          a['class'] = a.get('class', []) + ['loconotion-anchor-link']
-          if (sub_page_href in processed_pages.keys() or sub_page_href in sub_pages):
-            log.debug(f"Original page for anchor link {sub_page_href} already parsed / pending parsing, skipping")
-            continue
-        else:
-          a['href'] = self.get_page_slug(sub_page_href) if sub_page_href != index else "index.html"
-        sub_pages.append(sub_page_href)
-        log.debug(f"Found link to page {a['href']}")
-
-    # exports the parsed page
-    html_str = str(soup)
-    html_file = self.get_page_slug(url) if url != index else "index.html"
-    if (html_file in processed_pages.values()):
-      log.error(f"Found duplicate pages with slug '{html_file}' - previous one will be overwritten." +
-        "make sure that your notion pages names or custom slugs in the configuration files are unique")
-    log.info(f"Exporting page '{url}' as '{html_file}'")
-    with open(self.dist_folder / html_file, "wb") as f:
-      f.write(html_str.encode('utf-8').strip())
-    processed_pages[url] = html_file
-
-    # parse sub-pages
-    if (sub_pages and not self.args.get("single_page", False)):
-      if (processed_pages): log.debug(f"Pages processed so far: {len(processed_pages)}")
-      for sub_page in sub_pages:
-        if not sub_page in processed_pages.keys():
-          self.parse_page(sub_page, processed_pages = processed_pages, index = index)
-
-    #we're all done!
-    return processed_pages
+        # embed custom google font(s)
+        fonts_selectors = {
+            "site": "div:not(.notion-code-block)",
+            "navbar": ".notion-topbar div",
+            "title": ".notion-page-block > div, .notion-collection_view_page-block > div",
+            "h1": ".notion-header-block div, notion-page-content >"
+            " notion-collection_view-block > div:first-child div",
+            "h2": ".notion-sub_header-block div",
+            "h3": ".notion-sub_sub_header-block div",
+            "body": ".notion-app-inner",
+            "code": ".notion-code-block *",
+        }
+        custom_fonts = self.get_page_config(url).get("fonts", {})
+        if custom_fonts:
+            # append a stylesheet importing the google font for each unique font
+            unique_custom_fonts = set(custom_fonts.values())
+            for font in unique_custom_fonts:
+                if font:
+                    google_fonts_embed_name = font.replace(" ", "+")
+                    font_href = f"https://fonts.googleapis.com/css2?family={google_fonts_embed_name}:wght@500;600;700&display=swap"
+                    custom_font_stylesheet = soup.new_tag(
+                        "link", rel="stylesheet", href=font_href
+                    )
+                    soup.head.append(custom_font_stylesheet)
 
+            # go through each custom font, and add a css rule overriding the font-family
+            # to the font override stylesheet targeting the appropriate selector
+            font_override_stylesheet = soup.new_tag("style", type="text/css")
+            for target, custom_font in custom_fonts.items():
+                if custom_font and not target == "site":
+                    log.debug(f"Setting {target} font-family to {custom_font}")
+                    font_override_stylesheet.append(
+                        fonts_selectors[target]
+                        + " {font-family:"
+                        + custom_font
+                        + " !important} "
+                    )
+            site_font = custom_fonts.get("site", None)
+            # process global site font last so more granular settings can override it
+            if site_font:
+                log.debug(f"Setting global site font-family to {site_font}")
+                font_override_stylesheet.append(
+                    fonts_selectors["site"] + " {font-family:" + site_font + "} "
+                )
+            # finally append the font overrides stylesheets to the page
+            soup.head.append(font_override_stylesheet)
 
-  def run(self, url):
-    start_time = time.time()
-    total_processed_pages = self.parse_page(url)
-    elapsed_time = time.time() - start_time
-    formatted_time = '{:02d}:{:02d}:{:02d}'.format(int(elapsed_time // 3600), int(elapsed_time % 3600 // 60), int(elapsed_time % 60))
-    log.info(f'Finished!\n\nProcessed {len(total_processed_pages)} pages in {formatted_time}')
\ No newline at end of file
+        # inject any custom elements to the page
+        custom_injects = self.get_page_config(url).get("inject", {})
+
+        def injects_custom_tags(section):
+            section_custom_injects = custom_injects.get(section, {})
+            for tag, elements in section_custom_injects.items():
+                for element in elements:
+                    injected_tag = soup.new_tag(tag)
+                    for attr, value in element.items():
+                        injected_tag[attr] = value
+                        # if the value refers to a file, copy it to the dist folder
+                        if attr.lower() == "href" or attr.lower() == "src":
+                            log.debug(f"Copying injected file '{value}'")
+                            cached_custom_file = self.cache_file(
+                                (Path.cwd() / value.strip("/"))
+                            )
+                            # destination = (self.dist_folder / source.name)
+                            # shutil.copyfile(source, destination)
+                            injected_tag[attr] = str(cached_custom_file)  # source.name
+                    log.debug(f"Injecting <{section}> tag: {str(injected_tag)}")
+                    soup.find(section).append(injected_tag)
+
+        injects_custom_tags("head")
+        injects_custom_tags("body")
+
+        # inject loconotion's custom stylesheet and script
+        loconotion_custom_css = self.cache_file(Path("bundles/loconotion.css"))
+        custom_css = soup.new_tag(
+            "link", rel="stylesheet", href=str(loconotion_custom_css)
+        )
+        soup.head.insert(-1, custom_css)
+        loconotion_custom_js = self.cache_file(Path("bundles/loconotion.js"))
+        custom_script = soup.new_tag(
+            "script", type="text/javascript", src=str(loconotion_custom_js)
+        )
+        soup.body.insert(-1, custom_script)
+
+        # find sub-pages and clean slugs / links
+        sub_pages = []
+        for a in soup.findAll("a"):
+            if a["href"].startswith("/"):
+                sub_page_href = "https://www.notion.so" + a["href"]
+                # if the link is an anchor link,
+                # check if the page hasn't already been parsed
+                if "#" in sub_page_href:
+                    sub_page_href_tokens = sub_page_href.split("#")
+                    sub_page_href = sub_page_href_tokens[0]
+                    a["href"] = "#" + sub_page_href_tokens[-1]
+                    a["class"] = a.get("class", []) + ["loconotion-anchor-link"]
+                    if (
+                        sub_page_href in processed_pages.keys()
+                        or sub_page_href in sub_pages
+                    ):
+                        log.debug(
+                            f"Original page for anchor link {sub_page_href}"
+                            " already parsed / pending parsing, skipping"
+                        )
+                        continue
+                else:
+                    a["href"] = (
+                        self.get_page_slug(sub_page_href)
+                        if sub_page_href != index
+                        else "index.html"
+                    )
+                sub_pages.append(sub_page_href)
+                log.debug(f"Found link to page {a['href']}")
+
+        # exports the parsed page
+        html_str = str(soup)
+        html_file = self.get_page_slug(url) if url != index else "index.html"
+        if html_file in processed_pages.values():
+            log.error(
+                f"Found duplicate pages with slug '{html_file}' - previous one will be"
+                " overwritten. Make sure that your notion pages names or custom slugs"
+                " in the configuration files are unique"
+            )
+        log.info(f"Exporting page '{url}' as '{html_file}'")
+        with open(self.dist_folder / html_file, "wb") as f:
+            f.write(html_str.encode("utf-8").strip())
+        processed_pages[url] = html_file
+
+        # parse sub-pages
+        if sub_pages and not self.args.get("single_page", False):
+            if processed_pages:
+                log.debug(f"Pages processed so far: {len(processed_pages)}")
+            for sub_page in sub_pages:
+                if sub_page not in processed_pages.keys():
+                    self.parse_page(
+                        sub_page, processed_pages=processed_pages, index=index
+                    )
+
+        # we're all done!
+        return processed_pages
+
+    def run(self, url):
+        start_time = time.time()
+        tot_processed_pages = self.parse_page(url)
+        elapsed_time = time.time() - start_time
+        formatted_time = "{:02d}:{:02d}:{:02d}".format(
+            int(elapsed_time // 3600),
+            int(elapsed_time % 3600 // 60),
+            int(elapsed_time % 60),
+        )
+        log.info(
+            f"Finished!\n\nProcessed {len(tot_processed_pages)} pages in {formatted_time}"
+        )
diff --git a/poetry.lock b/poetry.lock
new file mode 100644
index 0000000..7404aed
--- /dev/null
+++ b/poetry.lock
@@ -0,0 +1,315 @@
+[[package]]
+category = "dev"
+description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
+name = "appdirs"
+optional = false
+python-versions = "*"
+version = "1.4.4"
+
+[[package]]
+category = "dev"
+description = "Classes Without Boilerplate"
+name = "attrs"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+version = "19.3.0"
+
+[package.extras]
+azure-pipelines = ["coverage", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface", "pytest-azurepipelines"]
+dev = ["coverage", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface", "sphinx", "pre-commit"]
+docs = ["sphinx", "zope.interface"]
+tests = ["coverage", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface"]
+
+[[package]]
+category = "main"
+description = "Screen-scraping library"
+name = "beautifulsoup4"
+optional = false
+python-versions = "*"
+version = "4.9.1"
+
+[package.dependencies]
+soupsieve = [">1.2", "<2.0"]
+
+[package.extras]
+html5lib = ["html5lib"]
+lxml = ["lxml"]
+
+[[package]]
+category = "dev"
+description = "The uncompromising code formatter."
+name = "black"
+optional = false
+python-versions = ">=3.6"
+version = "19.10b0"
+
+[package.dependencies]
+appdirs = "*"
+attrs = ">=18.1.0"
+click = ">=6.5"
+pathspec = ">=0.6,<1"
+regex = "*"
+toml = ">=0.9.4"
+typed-ast = ">=1.4.0"
+
+[package.extras]
+d = ["aiohttp (>=3.3.2)", "aiohttp-cors"]
+
+[[package]]
+category = "main"
+description = "Python package for providing Mozilla's CA Bundle."
+name = "certifi"
+optional = false
+python-versions = "*"
+version = "2020.4.5.1"
+
+[[package]]
+category = "main"
+description = "Universal encoding detector for Python 2 and 3"
+name = "chardet"
+optional = false
+python-versions = "*"
+version = "3.0.4"
+
+[[package]]
+category = "main"
+description = "Automatically install chromedriver that supports the currently installed version of chrome."
+name = "chromedriver-autoinstaller"
+optional = false
+python-versions = ">=3"
+version = "0.2.0"
+
+[[package]]
+category = "dev"
+description = "Composable command line interface toolkit"
+name = "click"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+version = "7.1.2"
+
+[[package]]
+category = "main"
+description = "Cross-platform colored terminal text."
+name = "colorama"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+version = "0.4.3"
+
+[[package]]
+category = "main"
+description = "A CSS Cascading Style Sheets library for Python"
+name = "cssutils"
+optional = false
+python-versions = "*"
+version = "1.0.2"
+
+[[package]]
+category = "main"
+description = "Internationalized Domain Names in Applications (IDNA)"
+name = "idna"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+version = "2.9"
+
+[[package]]
+category = "dev"
+description = "Utility library for gitignore style pattern matching of file paths."
+name = "pathspec"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+version = "0.8.0"
+
+[[package]]
+category = "dev"
+description = "Alternative regular expression module, to replace re."
+name = "regex"
+optional = false
+python-versions = "*"
+version = "2020.5.14"
+
+[[package]]
+category = "main"
+description = "Python HTTP for Humans."
+name = "requests"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+version = "2.23.0"
+
+[package.dependencies]
+certifi = ">=2017.4.17"
+chardet = ">=3.0.2,<4"
+idna = ">=2.5,<3"
+urllib3 = ">=1.21.1,<1.25.0 || >1.25.0,<1.25.1 || >1.25.1,<1.26"
+
+[package.extras]
+security = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)"]
+socks = ["PySocks (>=1.5.6,<1.5.7 || >1.5.7)", "win-inet-pton"]
+
+[[package]]
+category = "main"
+description = "Python bindings for Selenium"
+name = "selenium"
+optional = false
+python-versions = "*"
+version = "3.141.0"
+
+[package.dependencies]
+urllib3 = "*"
+
+[[package]]
+category = "main"
+description = "A modern CSS selector implementation for Beautiful Soup."
+name = "soupsieve"
+optional = false
+python-versions = "*"
+version = "1.9.6"
+
+[[package]]
+category = "main"
+description = "Python Library for Tom's Obvious, Minimal Language"
+name = "toml"
+optional = false
+python-versions = "*"
+version = "0.10.1"
+
+[[package]]
+category = "dev"
+description = "a fork of Python 2 and 3 ast modules with type comment support"
+name = "typed-ast"
+optional = false
+python-versions = "*"
+version = "1.4.1"
+
+[[package]]
+category = "main"
+description = "HTTP library with thread-safe connection pooling, file post, and more."
+name = "urllib3"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4"
+version = "1.25.9"
+
+[package.extras]
+brotli = ["brotlipy (>=0.6.0)"]
+secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "pyOpenSSL (>=0.14)", "ipaddress"]
+socks = ["PySocks (>=1.5.6,<1.5.7 || >1.5.7,<2.0)"]
+
+[metadata]
+content-hash = "ef223e0d435f4ab7f38a6499586aecdb96924ccb7bd59cd0982d0496479ad60f"
+python-versions = "^3.7"
+
+[metadata.files]
+appdirs = [
+    {file = "appdirs-1.4.4-py2.py3-none-any.whl", hash = "sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128"},
+    {file = "appdirs-1.4.4.tar.gz", hash = "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41"},
+]
+attrs = [
+    {file = "attrs-19.3.0-py2.py3-none-any.whl", hash = "sha256:08a96c641c3a74e44eb59afb61a24f2cb9f4d7188748e76ba4bb5edfa3cb7d1c"},
+    {file = "attrs-19.3.0.tar.gz", hash = "sha256:f7b7ce16570fe9965acd6d30101a28f62fb4a7f9e926b3bbc9b61f8b04247e72"},
+]
+beautifulsoup4 = [
+    {file = "beautifulsoup4-4.9.1-py2-none-any.whl", hash = "sha256:e718f2342e2e099b640a34ab782407b7b676f47ee272d6739e60b8ea23829f2c"},
+    {file = "beautifulsoup4-4.9.1-py3-none-any.whl", hash = "sha256:a6237df3c32ccfaee4fd201c8f5f9d9df619b93121d01353a64a73ce8c6ef9a8"},
+    {file = "beautifulsoup4-4.9.1.tar.gz", hash = "sha256:73cc4d115b96f79c7d77c1c7f7a0a8d4c57860d1041df407dd1aae7f07a77fd7"},
+]
+black = [
+    {file = "black-19.10b0-py36-none-any.whl", hash = "sha256:1b30e59be925fafc1ee4565e5e08abef6b03fe455102883820fe5ee2e4734e0b"},
+    {file = "black-19.10b0.tar.gz", hash = "sha256:c2edb73a08e9e0e6f65a0e6af18b059b8b1cdd5bef997d7a0b181df93dc81539"},
+]
+certifi = [
+    {file = "certifi-2020.4.5.1-py2.py3-none-any.whl", hash = "sha256:1d987a998c75633c40847cc966fcf5904906c920a7f17ef374f5aa4282abd304"},
+    {file = "certifi-2020.4.5.1.tar.gz", hash = "sha256:51fcb31174be6e6664c5f69e3e1691a2d72a1a12e90f872cbdb1567eb47b6519"},
+]
+chardet = [
+    {file = "chardet-3.0.4-py2.py3-none-any.whl", hash = "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"},
+    {file = "chardet-3.0.4.tar.gz", hash = "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae"},
+]
+chromedriver-autoinstaller = [
+    {file = "chromedriver-autoinstaller-0.2.0.tar.gz", hash = "sha256:e6aadc277f2c3a1d247541eecb60bfdeabb3250c56ad9998595420840d1c7f71"},
+    {file = "chromedriver_autoinstaller-0.2.0-py3-none-any.whl", hash = "sha256:290a72a1e60e5d806ac0d7cc14bd6aa0746bf8e007899efca48b25eb239ea851"},
+]
+click = [
+    {file = "click-7.1.2-py2.py3-none-any.whl", hash = "sha256:dacca89f4bfadd5de3d7489b7c8a566eee0d3676333fbb50030263894c38c0dc"},
+    {file = "click-7.1.2.tar.gz", hash = "sha256:d2b5255c7c6349bc1bd1e59e08cd12acbbd63ce649f2588755783aa94dfb6b1a"},
+]
+colorama = [
+    {file = "colorama-0.4.3-py2.py3-none-any.whl", hash = "sha256:7d73d2a99753107a36ac6b455ee49046802e59d9d076ef8e47b61499fa29afff"},
+    {file = "colorama-0.4.3.tar.gz", hash = "sha256:e96da0d330793e2cb9485e9ddfd918d456036c7149416295932478192f4436a1"},
+]
+cssutils = [
+    {file = "cssutils-1.0.2-py3-none-any.whl", hash = "sha256:c74dbe19c92f5052774eadb15136263548dd013250f1ed1027988e7fef125c8d"},
+    {file = "cssutils-1.0.2.tar.gz", hash = "sha256:a2fcf06467553038e98fea9cfe36af2bf14063eb147a70958cfcaa8f5786acaf"},
+]
+idna = [
+    {file = "idna-2.9-py2.py3-none-any.whl", hash = "sha256:a068a21ceac8a4d63dbfd964670474107f541babbd2250d61922f029858365fa"},
+    {file = "idna-2.9.tar.gz", hash = "sha256:7588d1c14ae4c77d74036e8c22ff447b26d0fde8f007354fd48a7814db15b7cb"},
+]
+pathspec = [
+    {file = "pathspec-0.8.0-py2.py3-none-any.whl", hash = "sha256:7d91249d21749788d07a2d0f94147accd8f845507400749ea19c1ec9054a12b0"},
+    {file = "pathspec-0.8.0.tar.gz", hash = "sha256:da45173eb3a6f2a5a487efba21f050af2b41948be6ab52b6a1e3ff22bb8b7061"},
+]
+regex = [
+    {file = "regex-2020.5.14-cp27-cp27m-win32.whl", hash = "sha256:e565569fc28e3ba3e475ec344d87ed3cd8ba2d575335359749298a0899fe122e"},
+    {file = "regex-2020.5.14-cp27-cp27m-win_amd64.whl", hash = "sha256:d466967ac8e45244b9dfe302bbe5e3337f8dc4dec8d7d10f5e950d83b140d33a"},
+    {file = "regex-2020.5.14-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:27ff7325b297fb6e5ebb70d10437592433601c423f5acf86e5bc1ee2919b9561"},
+    {file = "regex-2020.5.14-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:ea55b80eb0d1c3f1d8d784264a6764f931e172480a2f1868f2536444c5f01e01"},
+    {file = "regex-2020.5.14-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:c9bce6e006fbe771a02bda468ec40ffccbf954803b470a0345ad39c603402577"},
+    {file = "regex-2020.5.14-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:d881c2e657c51d89f02ae4c21d9adbef76b8325fe4d5cf0e9ad62f850f3a98fd"},
+    {file = "regex-2020.5.14-cp36-cp36m-win32.whl", hash = "sha256:99568f00f7bf820c620f01721485cad230f3fb28f57d8fbf4a7967ec2e446994"},
+    {file = "regex-2020.5.14-cp36-cp36m-win_amd64.whl", hash = "sha256:70c14743320a68c5dac7fc5a0f685be63bc2024b062fe2aaccc4acc3d01b14a1"},
+    {file = "regex-2020.5.14-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:a7c37f048ec3920783abab99f8f4036561a174f1314302ccfa4e9ad31cb00eb4"},
+    {file = "regex-2020.5.14-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:89d76ce33d3266173f5be80bd4efcbd5196cafc34100fdab814f9b228dee0fa4"},
+    {file = "regex-2020.5.14-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:51f17abbe973c7673a61863516bdc9c0ef467407a940f39501e786a07406699c"},
+    {file = "regex-2020.5.14-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:ce5cc53aa9fbbf6712e92c7cf268274eaff30f6bd12a0754e8133d85a8fb0f5f"},
+    {file = "regex-2020.5.14-cp37-cp37m-win32.whl", hash = "sha256:8044d1c085d49673aadb3d7dc20ef5cb5b030c7a4fa253a593dda2eab3059929"},
+    {file = "regex-2020.5.14-cp37-cp37m-win_amd64.whl", hash = "sha256:c2062c7d470751b648f1cacc3f54460aebfc261285f14bc6da49c6943bd48bdd"},
+    {file = "regex-2020.5.14-cp38-cp38-manylinux1_i686.whl", hash = "sha256:329ba35d711e3428db6b45a53b1b13a0a8ba07cbbcf10bbed291a7da45f106c3"},
+    {file = "regex-2020.5.14-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:579ea215c81d18da550b62ff97ee187b99f1b135fd894a13451e00986a080cad"},
+    {file = "regex-2020.5.14-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:3a9394197664e35566242686d84dfd264c07b20f93514e2e09d3c2b3ffdf78fe"},
+    {file = "regex-2020.5.14-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:ce367d21f33e23a84fb83a641b3834dd7dd8e9318ad8ff677fbfae5915a239f7"},
+    {file = "regex-2020.5.14-cp38-cp38-win32.whl", hash = "sha256:1386e75c9d1574f6aa2e4eb5355374c8e55f9aac97e224a8a5a6abded0f9c927"},
+    {file = "regex-2020.5.14-cp38-cp38-win_amd64.whl", hash = "sha256:7e61be8a2900897803c293247ef87366d5df86bf701083b6c43119c7c6c99108"},
+    {file = "regex-2020.5.14.tar.gz", hash = "sha256:ce450ffbfec93821ab1fea94779a8440e10cf63819be6e176eb1973a6017aff5"},
+]
+requests = [
+    {file = "requests-2.23.0-py2.py3-none-any.whl", hash = "sha256:43999036bfa82904b6af1d99e4882b560e5e2c68e5c4b0aa03b655f3d7d73fee"},
+    {file = "requests-2.23.0.tar.gz", hash =
"sha256:b3f43d496c6daba4493e7c431722aeb7dbc6288f52a6e04e7b6023b0247817e6"}, +] +selenium = [ + {file = "selenium-3.141.0-py2.py3-none-any.whl", hash = "sha256:2d7131d7bc5a5b99a2d9b04aaf2612c411b03b8ca1b1ee8d3de5845a9be2cb3c"}, + {file = "selenium-3.141.0.tar.gz", hash = "sha256:deaf32b60ad91a4611b98d8002757f29e6f2c2d5fcaf202e1c9ad06d6772300d"}, +] +soupsieve = [ + {file = "soupsieve-1.9.6-py2.py3-none-any.whl", hash = "sha256:feb1e937fa26a69e08436aad4a9037cd7e1d4c7212909502ba30701247ff8abd"}, + {file = "soupsieve-1.9.6.tar.gz", hash = "sha256:7985bacc98c34923a439967c1a602dc4f1e15f923b6fcf02344184f86cc7efaa"}, +] +toml = [ + {file = "toml-0.10.1-py2.py3-none-any.whl", hash = "sha256:bda89d5935c2eac546d648028b9901107a595863cb36bae0c73ac804a9b4ce88"}, + {file = "toml-0.10.1.tar.gz", hash = "sha256:926b612be1e5ce0634a2ca03470f95169cf16f939018233a670519cb4ac58b0f"}, +] +typed-ast = [ + {file = "typed_ast-1.4.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:73d785a950fc82dd2a25897d525d003f6378d1cb23ab305578394694202a58c3"}, + {file = "typed_ast-1.4.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:aaee9905aee35ba5905cfb3c62f3e83b3bec7b39413f0a7f19be4e547ea01ebb"}, + {file = "typed_ast-1.4.1-cp35-cp35m-win32.whl", hash = "sha256:0c2c07682d61a629b68433afb159376e24e5b2fd4641d35424e462169c0a7919"}, + {file = "typed_ast-1.4.1-cp35-cp35m-win_amd64.whl", hash = "sha256:4083861b0aa07990b619bd7ddc365eb7fa4b817e99cf5f8d9cf21a42780f6e01"}, + {file = "typed_ast-1.4.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:269151951236b0f9a6f04015a9004084a5ab0d5f19b57de779f908621e7d8b75"}, + {file = "typed_ast-1.4.1-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:24995c843eb0ad11a4527b026b4dde3da70e1f2d8806c99b7b4a7cf491612652"}, + {file = "typed_ast-1.4.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:fe460b922ec15dd205595c9b5b99e2f056fd98ae8f9f56b888e7a17dc2b757e7"}, + {file = "typed_ast-1.4.1-cp36-cp36m-win32.whl", hash = "sha256:4e3e5da80ccbebfff202a67bf900d081906c358ccc3d5e3c8aea42fdfdfd51c1"}, + {file = "typed_ast-1.4.1-cp36-cp36m-win_amd64.whl", hash = "sha256:249862707802d40f7f29f6e1aad8d84b5aa9e44552d2cc17384b209f091276aa"}, + {file = "typed_ast-1.4.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8ce678dbaf790dbdb3eba24056d5364fb45944f33553dd5869b7580cdbb83614"}, + {file = "typed_ast-1.4.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:c9e348e02e4d2b4a8b2eedb48210430658df6951fa484e59de33ff773fbd4b41"}, + {file = "typed_ast-1.4.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:bcd3b13b56ea479b3650b82cabd6b5343a625b0ced5429e4ccad28a8973f301b"}, + {file = "typed_ast-1.4.1-cp37-cp37m-win32.whl", hash = "sha256:d5d33e9e7af3b34a40dc05f498939f0ebf187f07c385fd58d591c533ad8562fe"}, + {file = "typed_ast-1.4.1-cp37-cp37m-win_amd64.whl", hash = "sha256:0666aa36131496aed8f7be0410ff974562ab7eeac11ef351def9ea6fa28f6355"}, + {file = "typed_ast-1.4.1-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:d205b1b46085271b4e15f670058ce182bd1199e56b317bf2ec004b6a44f911f6"}, + {file = "typed_ast-1.4.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:6daac9731f172c2a22ade6ed0c00197ee7cc1221aa84cfdf9c31defeb059a907"}, + {file = "typed_ast-1.4.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:498b0f36cc7054c1fead3d7fc59d2150f4d5c6c56ba7fb150c013fbc683a8d2d"}, + {file = "typed_ast-1.4.1-cp38-cp38-win32.whl", hash = "sha256:715ff2f2df46121071622063fc7543d9b1fd19ebfc4f5c8895af64a77a8c852c"}, + {file = "typed_ast-1.4.1-cp38-cp38-win_amd64.whl", hash = 
"sha256:fc0fea399acb12edbf8a628ba8d2312f583bdbdb3335635db062fa98cf71fca4"}, + {file = "typed_ast-1.4.1-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:d43943ef777f9a1c42bf4e552ba23ac77a6351de620aa9acf64ad54933ad4d34"}, + {file = "typed_ast-1.4.1.tar.gz", hash = "sha256:8c8aaad94455178e3187ab22c8b01a3837f8ee50e09cf31f1ba129eb293ec30b"}, +] +urllib3 = [ + {file = "urllib3-1.25.9-py2.py3-none-any.whl", hash = "sha256:88206b0eb87e6d677d424843ac5209e3fb9d0190d0ee169599165ec25e9d9115"}, + {file = "urllib3-1.25.9.tar.gz", hash = "sha256:3018294ebefce6572a474f0604c2021e33b3fd8006ecd11d62107a5d2a963527"}, +] diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..6017b2d --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,22 @@ +[tool.poetry] +name = "loconotion" +version = "0.1.0" +description = "" +authors = ["Leonardo Cavaletti "] + +[tool.poetry.dependencies] +python = "^3.7" +beautifulsoup4 = "^4.9.1" +chromedriver-autoinstaller = "^0.2.0" +colorama = "^0.4.3" +cssutils = "^1.0.2" +requests = "^2.23.0" +selenium = "^3.141.0" +toml = "^0.10.1" + +[tool.poetry.dev-dependencies] +black = "^19.10b0" + +[build-system] +requires = ["poetry>=0.12"] +build-backend = "poetry.masonry.api" diff --git a/requirements.txt b/requirements.txt index 45ade18..dfefd6d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,37 @@ -beautifulsoup4==4.9.1 -certifi==2020.4.5.1 -chardet==3.0.4 -chromedriver-autoinstaller==0.2.0 -colorama==0.4.3 -cssutils==1.0.2 -idna==2.9 -requests==2.23.0 -selenium==3.141.0 -soupsieve==2.0.1 -toml==0.10.1 -urllib3==1.25.9 +beautifulsoup4==4.9.1 \ + --hash=sha256:e718f2342e2e099b640a34ab782407b7b676f47ee272d6739e60b8ea23829f2c \ + --hash=sha256:a6237df3c32ccfaee4fd201c8f5f9d9df619b93121d01353a64a73ce8c6ef9a8 \ + --hash=sha256:73cc4d115b96f79c7d77c1c7f7a0a8d4c57860d1041df407dd1aae7f07a77fd7 +certifi==2020.4.5.1 \ + --hash=sha256:1d987a998c75633c40847cc966fcf5904906c920a7f17ef374f5aa4282abd304 \ + --hash=sha256:51fcb31174be6e6664c5f69e3e1691a2d72a1a12e90f872cbdb1567eb47b6519 +chardet==3.0.4 \ + --hash=sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691 \ + --hash=sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae +chromedriver-autoinstaller==0.2.0 \ + --hash=sha256:e6aadc277f2c3a1d247541eecb60bfdeabb3250c56ad9998595420840d1c7f71 \ + --hash=sha256:290a72a1e60e5d806ac0d7cc14bd6aa0746bf8e007899efca48b25eb239ea851 +colorama==0.4.3 \ + --hash=sha256:7d73d2a99753107a36ac6b455ee49046802e59d9d076ef8e47b61499fa29afff \ + --hash=sha256:e96da0d330793e2cb9485e9ddfd918d456036c7149416295932478192f4436a1 +cssutils==1.0.2 \ + --hash=sha256:c74dbe19c92f5052774eadb15136263548dd013250f1ed1027988e7fef125c8d \ + --hash=sha256:a2fcf06467553038e98fea9cfe36af2bf14063eb147a70958cfcaa8f5786acaf +idna==2.9 \ + --hash=sha256:a068a21ceac8a4d63dbfd964670474107f541babbd2250d61922f029858365fa \ + --hash=sha256:7588d1c14ae4c77d74036e8c22ff447b26d0fde8f007354fd48a7814db15b7cb +requests==2.23.0 \ + --hash=sha256:43999036bfa82904b6af1d99e4882b560e5e2c68e5c4b0aa03b655f3d7d73fee \ + --hash=sha256:b3f43d496c6daba4493e7c431722aeb7dbc6288f52a6e04e7b6023b0247817e6 +selenium==3.141.0 \ + --hash=sha256:2d7131d7bc5a5b99a2d9b04aaf2612c411b03b8ca1b1ee8d3de5845a9be2cb3c \ + --hash=sha256:deaf32b60ad91a4611b98d8002757f29e6f2c2d5fcaf202e1c9ad06d6772300d +soupsieve==1.9.6 \ + --hash=sha256:feb1e937fa26a69e08436aad4a9037cd7e1d4c7212909502ba30701247ff8abd \ + --hash=sha256:7985bacc98c34923a439967c1a602dc4f1e15f923b6fcf02344184f86cc7efaa +toml==0.10.1 \ + 
--hash=sha256:bda89d5935c2eac546d648028b9901107a595863cb36bae0c73ac804a9b4ce88 \ + --hash=sha256:926b612be1e5ce0634a2ca03470f95169cf16f939018233a670519cb4ac58b0f +urllib3==1.25.9 \ + --hash=sha256:88206b0eb87e6d677d424843ac5209e3fb9d0190d0ee169599165ec25e9d9115 \ + --hash=sha256:3018294ebefce6572a474f0604c2021e33b3fd8006ecd11d62107a5d2a963527