Set up poetry and black formatter

This commit is contained in:
Leonardo Cavaletti 2020-05-24 17:13:48 +01:00
parent bd76bc3089
commit 7e5a11cb6a
8 changed files with 1098 additions and 573 deletions

.gitignore

@ -108,10 +108,7 @@ dmypy.json
.pyre/ .pyre/
# End of https://www.gitignore.io/api/python # End of https://www.gitignore.io/api/python
.env
.vscode .vscode
env temp
dist/* logs
test/*
logs/*
*.bat

README.md

@ -40,11 +40,13 @@ It does, but I wasn't really happy with the styling - the pages looked a bit ugl
## Installation & Requirements ## Installation & Requirements
`pip install -r requirements.txt` Make sure you're in your virtual environment of choice, then run
- `poetry install --no-dev` if you have [Poetry](https://python-poetry.org/) installed
- `pip install -r requirements.txt` otherwise
This script uses [ChromeDriver](chromedriver.chromium.org) to automate the Google Chrome browser - therefore Google Chrome needs to be installed in order to work. This script uses [ChromeDriver](chromedriver.chromium.org) to automate the Google Chrome browser - therefore Google Chrome needs to be installed in order to work.
The script comes bundled with the default windows chromedriver executable. On Max / Linux, download the right distribution for you from https://chromedriver.chromium.org/downloads and place the executable in this folder. Alternatively, use the `--chromedriver` argument to specify its path at runtime. The script will automatically try to download and use the appropriate chromedriver distribution for your OS and Chrome version. If this doesn't work, download the right version for you from https://chromedriver.chromium.org/downloads and use the `--chromedriver` argument to specify its path at runtime.
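For reference, a minimal sketch of the auto-install path described above, mirroring what `init_chromedriver` does further down in this diff (`chromedriver_autoinstaller` and `selenium` are both runtime dependencies; the target url is just a placeholder):

```python
import chromedriver_autoinstaller
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# download a chromedriver build matching the installed Chrome, and get its path
chromedriver_path = chromedriver_autoinstaller.install()

chrome_options = Options()
chrome_options.add_argument("--headless")  # run Chrome without a visible window
chrome_options.add_argument("window-size=1920,1080")

driver = webdriver.Chrome(
    executable_path=str(chromedriver_path), options=chrome_options
)
driver.get("https://www.notion.so/")  # placeholder page
print(driver.title)
driver.quit()
```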
## Simple Usage ## Simple Usage

main script (CLI entry point)

@ -16,15 +16,37 @@ except ModuleNotFoundError as error:
from notionparser import Parser from notionparser import Parser
def main(): def main():
# set up argument parser # set up argument parser
argparser = argparse.ArgumentParser(description='Generate static websites from Notion.so pages') argparser = argparse.ArgumentParser(
argparser.add_argument('target', help='The config file containing the site properties, or the url of the Notion.so page to generate the site from') description="Generate static websites from Notion.so pages"
argparser.add_argument('--chromedriver', help='Use a specific chromedriver executable instead of the auto-installing one') )
argparser.add_argument("--single-page", action="store_true", help="Only parse the first page, then stop") argparser.add_argument(
argparser.add_argument('--clean', action='store_true', help='Delete all previously cached files for the site before generating it') "target",
argparser.add_argument('--non-headless', action='store_true', help='Run chromedriver in non-headless mode') help="The config file containing the site properties, or the url"
argparser.add_argument("-v", "--verbose", action="store_true", help="Increasite output log verbosity") " of the Notion.so page to generate the site from",
)
argparser.add_argument(
"--chromedriver",
help="Use a specific chromedriver executable instead of the auto-installing one",
)
argparser.add_argument(
"--single-page", action="store_true", help="Only parse the first page, then stop"
)
argparser.add_argument(
"--clean",
action="store_true",
help="Delete all previously cached files for the site before generating it",
)
argparser.add_argument(
"--non-headless",
action="store_true",
help="Run chromedriver in non-headless mode",
)
argparser.add_argument(
"-v", "--verbose", action="store_true", help="Increasite output log verbosity"
)
args = argparser.parse_args() args = argparser.parse_args()
# set up some pretty logs # set up some pretty logs
@ -41,7 +63,7 @@ def main():
logging.INFO: colorama.Fore.BLUE, logging.INFO: colorama.Fore.BLUE,
logging.WARNING: colorama.Fore.YELLOW, logging.WARNING: colorama.Fore.YELLOW,
logging.ERROR: colorama.Fore.RED, logging.ERROR: colorama.Fore.RED,
logging.CRITICAL: colorama.Back.RED logging.CRITICAL: colorama.Back.RED,
} }
class ColorFormatter(logging.Formatter): class ColorFormatter(logging.Formatter):
@ -57,11 +79,14 @@ def main():
) )
return super(ColorFormatter, self).format(new_record, *args, **kwargs) return super(ColorFormatter, self).format(new_record, *args, **kwargs)
log_screen_handler.setFormatter(ColorFormatter(fmt='%(asctime)s %(levelname)-8s %(message)s', log_screen_handler.setFormatter(
ColorFormatter(
fmt="%(asctime)s %(levelname)-8s %(message)s",
datefmt="{color_begin}[%H:%M:%S]{color_end}".format( datefmt="{color_begin}[%H:%M:%S]{color_end}".format(
color_begin=colorama.Style.DIM, color_begin=colorama.Style.DIM, color_end=colorama.Style.RESET_ALL
color_end=colorama.Style.RESET_ALL ),
))) )
)
except ModuleNotFoundError as identifier: except ModuleNotFoundError as identifier:
pass pass
@ -70,7 +95,7 @@ def main():
if urllib.parse.urlparse(args.target).scheme: if urllib.parse.urlparse(args.target).scheme:
try: try:
response = requests.get(args.target) response = requests.get(args.target)
if ("notion.so" in args.target): if "notion.so" in args.target:
log.info("Initialising parser with simple page url") log.info("Initialising parser with simple page url")
config = {"page": args.target} config = {"page": args.target}
Parser(config=config, args=vars(args)) Parser(config=config, args=vars(args))
@ -88,14 +113,15 @@ def main():
else: else:
log.critical(f"Config file {args.target} does not exists") log.critical(f"Config file {args.target} does not exists")
except FileNotFoundError as e: except FileNotFoundError as e:
log.critical(f'FileNotFoundError: {e}') log.critical(f"FileNotFoundError: {e}")
sys.exit(0) sys.exit(0)
if __name__ == '__main__':
if __name__ == "__main__":
try: try:
main() main()
except KeyboardInterrupt: except KeyboardInterrupt:
log.critical('Interrupted by user') log.critical("Interrupted by user")
try: try:
sys.exit(0) sys.exit(0)
except SystemExit: except SystemExit:
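Since the bulk of this commit is black reformatting, here is a quick hedged illustration of what black does to one of the long `add_argument` calls above (assuming the `black==19.10b0` dev dependency added below; `format_str` is black's string-level formatting entry point):

```python
import black

src = (
    "argparser.add_argument('--clean', action='store_true', "
    "help='Delete all previously cached files for the site before generating it')\n"
)
# black rewraps the call across multiple lines and normalises quotes to double quotes,
# which is exactly the kind of change shown throughout this diff
print(black.format_str(src, mode=black.FileMode()))
```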

conditions.py

@ -2,38 +2,52 @@ import logging
log = logging.getLogger(f"loconotion.{__name__}") log = logging.getLogger(f"loconotion.{__name__}")
class notion_page_loaded(object): class notion_page_loaded(object):
"""An expectation for checking that a notion page has loaded. """An expectation for checking that a notion page has loaded."""
"""
def __init__(self, url): def __init__(self, url):
self.url = url self.url = url
def __call__(self, driver): def __call__(self, driver):
notion_presence = len(driver.find_elements_by_class_name("notion-presence-container")) notion_presence = len(
collection_view_block = len(driver.find_elements_by_class_name("notion-collection_view_page-block")); driver.find_elements_by_class_name("notion-presence-container")
collection_search = len(driver.find_elements_by_class_name("collectionSearch")); )
collection_view_block = len(
driver.find_elements_by_class_name("notion-collection_view_page-block")
)
collection_search = len(driver.find_elements_by_class_name("collectionSearch"))
# embed_ghosts = len(driver.find_elements_by_css_selector("div[embed-ghost]")); # embed_ghosts = len(driver.find_elements_by_css_selector("div[embed-ghost]"));
log.debug(f"Waiting for page content to load (presence container: {notion_presence}, loaders: {loading_spinners} )") log.debug(
if (notion_presence and not loading_spinners): f"Waiting for page content to load"
f" (presence container: {notion_presence}, loaders: {loading_spinners} )"
)
if notion_presence and not loading_spinners:
return True return True
else: else:
return False return False
class toggle_block_has_opened(object): class toggle_block_has_opened(object):
"""An expectation for checking that a notion toggle block has been opened. """An expectation for checking that a notion toggle block has been opened.
It does so by checking if the div hosting the content has enough children, It does so by checking if the div hosting the content has enough children,
and the absence of the loading spinner. and the absence of the loading spinner."""
"""
def __init__(self, toggle_block): def __init__(self, toggle_block):
self.toggle_block = toggle_block self.toggle_block = toggle_block
def __call__(self, driver): def __call__(self, driver):
toggle_content = self.toggle_block.find_element_by_css_selector("div:not([style]") toggle_content = self.toggle_block.find_element_by_css_selector("div:not([style]")
if (toggle_content): if toggle_content:
content_children = len(toggle_content.find_elements_by_tag_name("div")) content_children = len(toggle_content.find_elements_by_tag_name("div"))
is_loading = len(self.toggle_block.find_elements_by_class_name("loading-spinner")); is_loading = len(
log.debug(f"Waiting for toggle block to load ({content_children} children so far and {is_loading} loaders)") self.toggle_block.find_elements_by_class_name("loading-spinner")
if (content_children > 3 and not is_loading): )
log.debug(
f"Waiting for toggle block to load"
f" ({content_children} children so far and {is_loading} loaders)"
)
if content_children > 3 and not is_loading:
return True return True
else: else:
return False return False
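Both classes follow Selenium's custom expected-condition pattern: an object whose `__call__(driver)` returns a truthy value once the wait can stop. A minimal sketch of how the parser consumes `toggle_block_has_opened` (assuming an already-initialised `driver`; the same `WebDriverWait` call appears in notionparser.py below):

```python
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait

from conditions import toggle_block_has_opened

# assumes `driver` is an existing selenium webdriver on a loaded Notion page
toggle_block = driver.find_element_by_class_name("notion-toggle-block")
toggle_block.find_element_by_css_selector("div[role=button]").click()
try:
    # poll the condition (every 0.5s by default) until it returns True, or give up after 10s
    WebDriverWait(driver, 10).until(toggle_block_has_opened(toggle_block))
except TimeoutException:
    pass  # the toggle may simply have been open already
```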

notionparser.py

@ -25,6 +25,7 @@ try:
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import requests import requests
import cssutils import cssutils
cssutils.log.setLevel(logging.CRITICAL) # removes warning logs from cssutils cssutils.log.setLevel(logging.CRITICAL) # removes warning logs from cssutils
except ModuleNotFoundError as error: except ModuleNotFoundError as error:
log.critical(f"ModuleNotFoundError: {error}. have your installed the requirements?") log.critical(f"ModuleNotFoundError: {error}. have your installed the requirements?")
@ -32,14 +33,18 @@ except ModuleNotFoundError as error:
from conditions import toggle_block_has_opened from conditions import toggle_block_has_opened
class Parser():
class Parser:
def __init__(self, config={}, args={}): def __init__(self, config={}, args={}):
self.config = config self.config = config
self.args = args self.args = args
url = self.config.get("page", None) url = self.config.get("page", None)
if not url: if not url:
log.critical("No initial page url specified. If passing a configuration file," + log.critical(
"make sure it contains a 'page' key with the url of the notion.so page to parse") "No initial page url specified. If passing a configuration file,"
" make sure it contains a 'page' key with the url of the notion.so"
" page to parse"
)
return return
# get the site name from the config, or make it up by cleaning the target page's slug # get the site name from the config, or make it up by cleaning the target page's slug
@ -50,7 +55,7 @@ class Parser():
log.info(f"Setting output path to '{self.dist_folder}'") log.info(f"Setting output path to '{self.dist_folder}'")
# check if the argument to clean the dist folder was passed # check if the argument to clean the dist folder was passed
if (self.args.get("clean", False)): if self.args.get("clean", False):
try: try:
shutil.rmtree(self.dist_folder) shutil.rmtree(self.dist_folder)
log.info(f"Removing previously cached files in '{self.dist_folder}'") log.info(f"Removing previously cached files in '{self.dist_folder}'")
@ -69,25 +74,36 @@ class Parser():
site_config = self.config.get("site", {}) site_config = self.config.get("site", {})
# check if there's anything wrong with the site config # check if there's anything wrong with the site config
if (site_config.get("slug", None)): if site_config.get("slug", None):
log.error("'slug' parameter has no effect in the [site] table, and should only present in page tables.") log.error(
del site_config['slug'] "'slug' parameter has no effect in the [site] table, "
"and should only present in page tables."
)
del site_config["slug"]
# find a table in the configuration file whose key contains the passed token string # find a table in the configuration file whose key contains the passed token string
site_pages_config = self.config.get("pages", {}) site_pages_config = self.config.get("pages", {})
matching_pages_config = [value for key, value in site_pages_config.items() if key.lower() in token] matching_pages_config = [
if (matching_pages_config): value for key, value in site_pages_config.items() if key.lower() in token
if (len(matching_pages_config) > 1): ]
log.error(f"multiple matching page config tokens found for {token} in configuration file. Make sure pages urls / slugs are unique") if matching_pages_config:
if len(matching_pages_config) > 1:
log.error(
f"multiple matching page config tokens found for {token}"
" in configuration file. Make sure pages urls / slugs are unique"
)
return site_config return site_config
else: else:
# if found, merge it on top of the global site configuration table # if found, merge it on top of the global site configuration table
# log.debug(f"Config table found for page with token {token}") # log.debug(f"Config table found for page with token {token}")
matching_page_config = matching_pages_config[0] matching_page_config = matching_pages_config[0]
if (type(matching_page_config) is dict): if type(matching_page_config) is dict:
return {**site_config, **matching_page_config} return {**site_config, **matching_page_config}
else: else:
log.error(f"Matching page configuration for {url} was not a dict: {matching_page_config} - something went wrong") log.error(
f"Matching page configuration for {url} was not a dict:"
f" {matching_page_config} - something went wrong"
)
return site_config return site_config
else: else:
# log.debug(f"No config table found for page token {token}, using global site config table") # log.debug(f"No config table found for page token {token}, using global site config table")
@ -102,11 +118,11 @@ class Parser():
else: else:
# if not, clean up the existing slug # if not, clean up the existing slug
path = urllib.parse.urlparse(url).path.strip("/") path = urllib.parse.urlparse(url).path.strip("/")
if ("-" in path and len(path.split("-")) > 1): if "-" in path and len(path.split("-")) > 1:
# a standard notion page looks like the-page-title-[uuid] # a standard notion page looks like the-page-title-[uuid]
# strip the uuid and keep the page title only # strip the uuid and keep the page title only
path = "-".join(path.split("-")[:-1]).lower() path = "-".join(path.split("-")[:-1]).lower()
elif ("?" in path): elif "?" in path:
# database pages just have an uuid and a query param # database pages just have an uuid and a query param
# not much to do here, just get rid of the query param # not much to do here, just get rid of the query param
path = path.split("?")[0].lower() path = path.split("?")[0].lower()
@ -118,19 +134,19 @@ class Parser():
# if no filename is specified, generate a hashed id based on the query-less url, # if no filename is specified, generate a hashed id based on the query-less url,
# so we avoid re-downloading / caching files we already have # so we avoid re-downloading / caching files we already have
if (not filename): if not filename:
parsed_url = urllib.parse.urlparse(url) parsed_url = urllib.parse.urlparse(url)
queryless_url = parsed_url.netloc + parsed_url.path queryless_url = parsed_url.netloc + parsed_url.path
query_params = urllib.parse.parse_qs(parsed_url.query) query_params = urllib.parse.parse_qs(parsed_url.query)
# if any of the query params contains a size parameter, store it in the hash # if any of the query params contains a size parameter, store it in the hash
# so we can download other higher-resolution versions if needed # so we can download other higher-resolution versions if needed
if ("width" in query_params.keys()): if "width" in query_params.keys():
queryless_url = queryless_url + f"?width={query_params['width']}" queryless_url = queryless_url + f"?width={query_params['width']}"
filename = hashlib.sha1(str.encode(queryless_url)).hexdigest(); filename = hashlib.sha1(str.encode(queryless_url)).hexdigest()
destination = self.dist_folder / filename destination = self.dist_folder / filename
# check if there are any files matching the filename, ignoring extension # check if there are any files matching the filename, ignoring extension
matching_file = glob.glob(str(destination.with_suffix('.*'))) matching_file = glob.glob(str(destination.with_suffix(".*")))
if not matching_file: if not matching_file:
# if url has a network scheme, download the file # if url has a network scheme, download the file
if "http" in urllib.parse.urlparse(url).scheme: if "http" in urllib.parse.urlparse(url).scheme:
@ -146,11 +162,11 @@ class Parser():
# if the filename does not have an extension at this point, # if the filename does not have an extension at this point,
# try to infer it from the url, and if not possible, # try to infer it from the url, and if not possible,
# from the content-type header mimetype # from the content-type header mimetype
if (not destination.suffix): if not destination.suffix:
file_extension = Path(urllib.parse.urlparse(url).path).suffix file_extension = Path(urllib.parse.urlparse(url).path).suffix
if (not file_extension): if not file_extension:
content_type = response.headers.get('content-type') content_type = response.headers.get("content-type")
if (content_type): if content_type:
file_extension = mimetypes.guess_extension(content_type) file_extension = mimetypes.guess_extension(content_type)
destination = destination.with_suffix(file_extension) destination = destination.with_suffix(file_extension)
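Putting the caching logic above together, a hedged sketch of how a remote file url becomes a local cache filename (the url and the `dist/site` folder are made up; the sha1 hashing and mimetype fallback mirror the code shown):

```python
import hashlib
import mimetypes
import urllib.parse
from pathlib import Path

url = "https://example.com/images/cover.png?width=600&cache=v2"  # hypothetical asset url
dist_folder = Path("dist/site")  # hypothetical output folder

parsed_url = urllib.parse.urlparse(url)
queryless_url = parsed_url.netloc + parsed_url.path
query_params = urllib.parse.parse_qs(parsed_url.query)
if "width" in query_params:
    # keep the width in the cache key so higher-resolution variants get their own file
    queryless_url = queryless_url + f"?width={query_params['width']}"

filename = hashlib.sha1(str.encode(queryless_url)).hexdigest()
destination = dist_folder / filename
if not destination.suffix:
    # infer the extension from the url path, falling back to the content-type mimetype
    file_extension = Path(parsed_url.path).suffix or mimetypes.guess_extension("image/png")
    destination = destination.with_suffix(file_extension)
print(destination)
```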
@ -177,51 +193,65 @@ class Parser():
def init_chromedriver(self): def init_chromedriver(self):
chromedriver_path = self.args.get("chromedriver") chromedriver_path = self.args.get("chromedriver")
if (not chromedriver_path): if not chromedriver_path:
try: try:
chromedriver_path = chromedriver_autoinstaller.install() chromedriver_path = chromedriver_autoinstaller.install()
except Exception as exception: except Exception as exception:
log.critical(f"Failed to install the built-in chromedriver: {exception}\n" + log.critical(
"download the correct version for your system at https://chromedriver.chromium.org/downloads" + f"Failed to install the built-in chromedriver: {exception}\n"
"and use the --chromedriver argument to point to the chromedriver executable") "\nDownload the correct version for your system at"
" https://chromedriver.chromium.org/downloads and use the"
" --chromedriver argument to point to the chromedriver executable"
)
sys.exit() sys.exit()
log.info(f"Initialising chromedriver at {chromedriver_path}") log.info(f"Initialising chromedriver at {chromedriver_path}")
logs_path = (Path.cwd() / "logs" / "webdrive.log") logs_path = Path.cwd() / "logs" / "webdrive.log"
logs_path.parent.mkdir(parents=True, exist_ok=True) logs_path.parent.mkdir(parents=True, exist_ok=True)
chrome_options = Options() chrome_options = Options()
if (not self.args.get("non_headless", False)): if not self.args.get("non_headless", False):
chrome_options.add_argument("--headless") chrome_options.add_argument("--headless")
chrome_options.add_argument("window-size=1920,1080") chrome_options.add_argument("window-size=1920,1080")
chrome_options.add_argument("--log-level=3"); chrome_options.add_argument("--log-level=3")
chrome_options.add_argument("--silent"); chrome_options.add_argument("--silent")
chrome_options.add_argument("--disable-logging") chrome_options.add_argument("--disable-logging")
# removes the 'DevTools listening' log message # removes the 'DevTools listening' log message
chrome_options.add_experimental_option('excludeSwitches', ['enable-logging']) chrome_options.add_experimental_option("excludeSwitches", ["enable-logging"])
return webdriver.Chrome( return webdriver.Chrome(
executable_path=str(chromedriver_path), executable_path=str(chromedriver_path),
service_log_path=str(logs_path), service_log_path=str(logs_path),
options=chrome_options) options=chrome_options,
)
def parse_page(self, url, processed_pages={}, index=None): def parse_page(self, url, processed_pages={}, index=None):
# if this is the first page being parsed, set it as the index.html # if this is the first page being parsed, set it as the index.html
if (not index): if not index:
index = url; index = url
log.info(f"Parsing page '{url}'") log.info(f"Parsing page '{url}'")
log.debug(f"Using page config: {self.get_page_config(url)}") log.debug(f"Using page config: {self.get_page_config(url)}")
self.driver.get(url) self.driver.get(url)
# if ("This content does not exist" in self.driver.page_source): # if "This content does not exist" in self.driver.page_source:
# log.error(f"No content found in {url}. Are you sure the page is set to public?") # log.error(
# f"No content found in {url}."
# " Are you sure the page is set to public?"
# )
# return # return
try: try:
# WebDriverWait(self.driver, 10).until(notion_page_loaded()) # WebDriverWait(self.driver, 10).until(notion_page_loaded())
WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'notion-presence-container'))) WebDriverWait(self.driver, 10).until(
EC.presence_of_element_located(
(By.CLASS_NAME, "notion-presence-container")
)
)
except TimeoutException as ex: except TimeoutException as ex:
log.critical("Timeout waiting for page content to load, or no content found. Are you sure the page is set to public?") log.critical(
"Timeout waiting for page content to load, or no content found."
" Are you sure the page is set to public?"
)
return return
# cooldown to allow eventual database items to load # cooldown to allow eventual database items to load
@ -231,185 +261,254 @@ class Parser():
# function to expand all the toggle block in the page to make their content visible # function to expand all the toggle block in the page to make their content visible
# so we can hook up our custom toggle logic afterwards # so we can hook up our custom toggle logic afterwards
def open_toggle_blocks(exclude=[]): def open_toggle_blocks(exclude=[]):
opened_toggles = exclude; opened_toggles = exclude
toggle_blocks = self.driver.find_elements_by_class_name("notion-toggle-block") toggle_blocks = self.driver.find_elements_by_class_name("notion-toggle-block")
log.debug(f"Opening {len(toggle_blocks)} new toggle blocks in the page") log.debug(f"Opening {len(toggle_blocks)} new toggle blocks in the page")
for toggle_block in toggle_blocks: for toggle_block in toggle_blocks:
if (not toggle_block in opened_toggles): if not toggle_block in opened_toggles:
toggle_button = toggle_block.find_element_by_css_selector("div[role=button]") toggle_button = toggle_block.find_element_by_css_selector(
"div[role=button]"
)
# check if the toggle is already open by the direction of its arrow # check if the toggle is already open by the direction of its arrow
is_toggled = "(180deg)" in (toggle_button.find_element_by_tag_name("svg").get_attribute("style")) is_toggled = "(180deg)" in (
if (not is_toggled): toggle_button.find_element_by_tag_name("svg").get_attribute(
"style"
)
)
if not is_toggled:
# click on it, then wait until all elements are displayed # click on it, then wait until all elements are displayed
toggle_button.click() toggle_button.click()
try: try:
WebDriverWait(self.driver, 10).until(toggle_block_has_opened(toggle_block)) WebDriverWait(self.driver, 10).until(
toggle_block_has_opened(toggle_block)
)
except TimeoutException as ex: except TimeoutException as ex:
log.warning("Timeout waiting for toggle block to open. Likely it's already open, but doesn't hurt to check.") log.warning(
"Timeout waiting for toggle block to open."
" Likely it's already open, but doesn't hurt to check."
)
except Exception as ex: except Exception as ex:
log.error("Something went wrong with selenium while trying to open a toggle block") log.error(
"Something went wrong while trying to open a toggle block"
)
opened_toggles.append(toggle_block) opened_toggles.append(toggle_block)
# after all toggles have been opened, check the page again to see if # after all toggles have been opened, check the page again to see if
# any toggle block had nested toggle blocks inside them # any toggle block had nested toggle blocks inside them
new_toggle_blocks = self.driver.find_elements_by_class_name("notion-toggle-block") new_toggle_blocks = self.driver.find_elements_by_class_name(
if (len(new_toggle_blocks) > len(toggle_blocks)): "notion-toggle-block"
)
if len(new_toggle_blocks) > len(toggle_blocks):
# if so, run the function again # if so, run the function again
open_toggle_blocks(opened_toggles) open_toggle_blocks(opened_toggles)
# open the toggle blocks in the page # open the toggle blocks in the page
open_toggle_blocks() open_toggle_blocks()
# creates soup from the page to start parsing # creates soup from the page to start parsing
soup = BeautifulSoup(self.driver.page_source, "html.parser") soup = BeautifulSoup(self.driver.page_source, "html.parser")
# remove scripts and other tags we don't want / need # remove scripts and other tags we don't want / need
for unwanted in soup.findAll('script'): for unwanted in soup.findAll("script"):
unwanted.decompose(); unwanted.decompose()
for intercom_frame in soup.findAll('div',{'id':'intercom-frame'}): for intercom_frame in soup.findAll("div", {"id": "intercom-frame"}):
intercom_frame.decompose(); intercom_frame.decompose()
for intercom_div in soup.findAll('div',{'class':'intercom-lightweight-app'}): for intercom_div in soup.findAll("div", {"class": "intercom-lightweight-app"}):
intercom_div.decompose(); intercom_div.decompose()
for overlay_div in soup.findAll('div',{'class':'notion-overlay-container'}): for overlay_div in soup.findAll("div", {"class": "notion-overlay-container"}):
overlay_div.decompose(); overlay_div.decompose()
for vendors_css in soup.find_all("link", href=lambda x: x and 'vendors~' in x): for vendors_css in soup.find_all("link", href=lambda x: x and "vendors~" in x):
vendors_css.decompose(); vendors_css.decompose()
# clean up the default notion meta tags # clean up the default notion meta tags
for tag in ["description", "twitter:card", "twitter:site", "twitter:title", "twitter:description", "twitter:image", "twitter:url", "apple-itunes-app"]: for tag in [
"description",
"twitter:card",
"twitter:site",
"twitter:title",
"twitter:description",
"twitter:image",
"twitter:url",
"apple-itunes-app",
]:
unwanted_tag = soup.find("meta", attrs={"name": tag}) unwanted_tag = soup.find("meta", attrs={"name": tag})
if (unwanted_tag): unwanted_tag.decompose(); if unwanted_tag:
for tag in ["og:site_name", "og:type", "og:url", "og:title", "og:description", "og:image"]: unwanted_tag.decompose()
for tag in [
"og:site_name",
"og:type",
"og:url",
"og:title",
"og:description",
"og:image",
]:
unwanted_og_tag = soup.find("meta", attrs={"property": tag}) unwanted_og_tag = soup.find("meta", attrs={"property": tag})
if (unwanted_og_tag): unwanted_og_tag.decompose(); if unwanted_og_tag:
unwanted_og_tag.decompose()
# set custom meta tags # set custom meta tags
custom_meta_tags = self.get_page_config(url).get("meta", []) custom_meta_tags = self.get_page_config(url).get("meta", [])
for custom_meta_tag in custom_meta_tags: for custom_meta_tag in custom_meta_tags:
tag = soup.new_tag('meta') tag = soup.new_tag("meta")
for attr, value in custom_meta_tag.items(): for attr, value in custom_meta_tag.items():
tag.attrs[attr] = value tag.attrs[attr] = value
log.debug(f"Adding meta tag {str(tag)}") log.debug(f"Adding meta tag {str(tag)}")
soup.head.append(tag) soup.head.append(tag)
# process images # process images
cache_images = True cache_images = True
for img in soup.findAll('img'): for img in soup.findAll("img"):
if img.has_attr('src'): if img.has_attr("src"):
if (cache_images and not 'data:image' in img['src']): if cache_images and not "data:image" in img["src"]:
img_src = img['src'] img_src = img["src"]
# if the path starts with /, it's one of notion's predefined images # if the path starts with /, it's one of notion's predefined images
if (img['src'].startswith('/')): if img["src"].startswith("/"):
img_src = "https://www.notion.so" + img['src'] img_src = "https://www.notion.so" + img["src"]
# notion's own default images urls are in a weird format, need to sanitize them # notion's own default images urls are in a weird format, need to sanitize them
# img_src = 'https://www.notion.so' + img['src'].split("notion.so")[-1].replace("notion.so", "").split("?")[0] # img_src = 'https://www.notion.so' + img['src'].split("notion.so")[-1].replace("notion.so", "").split("?")[0]
# if (not '.amazonaws' in img_src): # if (not '.amazonaws' in img_src):
# img_src = urllib.parse.unquote(img_src) # img_src = urllib.parse.unquote(img_src)
cached_image = self.cache_file(img_src) cached_image = self.cache_file(img_src)
img['src'] = cached_image img["src"] = cached_image
else: else:
if (img['src'].startswith('/')): if img["src"].startswith("/"):
img['src'] = "https://www.notion.so" + img['src'] img["src"] = "https://www.notion.so" + img["src"]
# process stylesheets # process stylesheets
for link in soup.findAll('link', rel="stylesheet"): for link in soup.findAll("link", rel="stylesheet"):
if link.has_attr('href') and link['href'].startswith('/'): if link.has_attr("href") and link["href"].startswith("/"):
# we don't need the vendors stylesheet # we don't need the vendors stylesheet
if ("vendors~" in link['href']): if "vendors~" in link["href"]:
continue continue
# css_file = link['href'].strip("/") # css_file = link['href'].strip("/")
cached_css_file = self.cache_file('https://www.notion.so' + link['href']) cached_css_file = self.cache_file("https://www.notion.so" + link["href"])
with open(self.dist_folder / cached_css_file, 'rb') as f: with open(self.dist_folder / cached_css_file, "rb") as f:
stylesheet = cssutils.parseString(f.read()) stylesheet = cssutils.parseString(f.read())
# open the stylesheet and check for any font-face rule, # open the stylesheet and check for any font-face rule,
for rule in stylesheet.cssRules: for rule in stylesheet.cssRules:
if rule.type == cssutils.css.CSSRule.FONT_FACE_RULE: if rule.type == cssutils.css.CSSRule.FONT_FACE_RULE:
# if any are found, download the font file # if any are found, download the font file
font_file = rule.style['src'].split("url(/")[-1].split(") format")[0] font_file = (
cached_font_file = self.cache_file(f'https://www.notion.so/{font_file}') rule.style["src"].split("url(/")[-1].split(") format")[0]
rule.style['src'] = f"url({str(cached_font_file)})" )
link['href'] = str(cached_css_file) cached_font_file = self.cache_file(
f"https://www.notion.so/{font_file}"
)
rule.style["src"] = f"url({str(cached_font_file)})"
link["href"] = str(cached_css_file)
# add our custom logic to all toggle blocks # add our custom logic to all toggle blocks
for toggle_block in soup.findAll('div',{'class':'notion-toggle-block'}): for toggle_block in soup.findAll("div", {"class": "notion-toggle-block"}):
toggle_id = uuid.uuid4() toggle_id = uuid.uuid4()
toggle_button = toggle_block.select_one('div[role=button]') toggle_button = toggle_block.select_one("div[role=button]")
toggle_content = toggle_block.find('div', {'class': None, 'style': ''}) toggle_content = toggle_block.find("div", {"class": None, "style": ""})
if (toggle_button and toggle_content): if toggle_button and toggle_content:
# add a custom class to the toggle button and content, plus a custom attribute # add a custom class to the toggle button and content,
# sharing a unique uuid so we can hook them up with some custom js logic later # plus a custom attribute sharing a unique uuid so
toggle_button['class'] = toggle_block.get('class', []) + ['loconotion-toggle-button'] # we can hook them up with some custom js logic later
toggle_content['class'] = toggle_content.get('class', []) + ['loconotion-toggle-content'] toggle_button["class"] = toggle_block.get("class", []) + [
toggle_content.attrs['loconotion-toggle-id'] = toggle_button.attrs['loconotion-toggle-id'] = toggle_id "loconotion-toggle-button"
]
toggle_content["class"] = toggle_content.get("class", []) + [
"loconotion-toggle-content"
]
toggle_content.attrs["loconotion-toggle-id"] = toggle_button.attrs[
"loconotion-toggle-id"
] = toggle_id
# if there are any table views in the page, add links to the title rows # if there are any table views in the page, add links to the title rows
for table_view in soup.findAll('div', {'class':'notion-table-view'}): for table_view in soup.findAll("div", {"class": "notion-table-view"}):
for table_row in table_view.findAll('div', {'class':'notion-collection-item'}): for table_row in table_view.findAll(
"div", {"class": "notion-collection-item"}
):
# for each row, hover the mouse over it to make the open button appear, # for each row, hover the mouse over it to make the open button appear,
# then grab its href and wrap the table row's name into a link # then grab its href and wrap the table row's name into a link
table_row_block_id = table_row['data-block-id'] table_row_block_id = table_row["data-block-id"]
table_row_hover_target = self.driver.find_element_by_css_selector(f"div[data-block-id='{table_row_block_id}'] > div > div") table_row_hover_target = self.driver.find_element_by_css_selector(
# need to scroll the row into view or else the open button won't be visible to selenium f"div[data-block-id='{table_row_block_id}'] > div > div"
self.driver.execute_script("arguments[0].scrollIntoView();", table_row_hover_target) )
ActionChains(self.driver).move_to_element(table_row_hover_target).perform() # need to scroll the row into view or else
# the open button won't be visible to selenium
self.driver.execute_script(
"arguments[0].scrollIntoView();", table_row_hover_target
)
ActionChains(self.driver).move_to_element(
table_row_hover_target
).perform()
try: try:
WebDriverWait(self.driver, 5).until(EC.visibility_of_element_located( WebDriverWait(self.driver, 5).until(
(By.CSS_SELECTOR, f"div[data-block-id='{table_row_block_id}'] > div > a"))) EC.visibility_of_element_located(
(
By.CSS_SELECTOR,
f"div[data-block-id='{table_row_block_id}'] > div > a",
)
)
)
except TimeoutException as ex: except TimeoutException as ex:
log.error(f"Timeout waiting for the 'open' button for row in table with block id {table_row_block_id}") log.error(
table_row_href = self.driver.find_element_by_css_selector(f"div[data-block-id='{table_row_block_id}'] > div > a").get_attribute('href') f"Timeout waiting for the 'open' button to appear for"
f" row in table with block id {table_row_block_id}"
)
table_row_href = self.driver.find_element_by_css_selector(
f"div[data-block-id='{table_row_block_id}'] > div > a"
).get_attribute("href")
table_row_href = table_row_href.split("notion.so")[-1] table_row_href = table_row_href.split("notion.so")[-1]
row_target_span = table_row.find("span") row_target_span = table_row.find("span")
row_link_wrapper = soup.new_tag('a', attrs={'href': table_row_href, 'style':"cursor: pointer;"}) row_link_wrapper = soup.new_tag(
"a", attrs={"href": table_row_href, "style": "cursor: pointer;"}
)
row_target_span.wrap(row_link_wrapper) row_target_span.wrap(row_link_wrapper)
# embed custom google font(s) # embed custom google font(s)
fonts_selectors = { fonts_selectors = {
"site": "div:not(.notion-code-block)", "site": "div:not(.notion-code-block)",
"navbar": ".notion-topbar div", "navbar": ".notion-topbar div",
"title": ".notion-page-block > div, .notion-collection_view_page-block > div", "title": ".notion-page-block > div, .notion-collection_view_page-block > div",
"h1" : ".notion-header-block div, notion-page-content > notion-collection_view-block > div:first-child div", "h1": ".notion-header-block div, notion-page-content >"
" notion-collection_view-block > div:first-child div",
"h2": ".notion-sub_header-block div", "h2": ".notion-sub_header-block div",
"h3": ".notion-sub_sub_header-block div", "h3": ".notion-sub_sub_header-block div",
"body": ".notion-app-inner", "body": ".notion-app-inner",
"code": ".notion-code-block *", "code": ".notion-code-block *",
} }
custom_fonts = self.get_page_config(url).get("fonts", {}) custom_fonts = self.get_page_config(url).get("fonts", {})
if (custom_fonts): if custom_fonts:
# append a stylesheet importing the google font for each unique font # append a stylesheet importing the google font for each unique font
unique_custom_fonts = set(custom_fonts.values()) unique_custom_fonts = set(custom_fonts.values())
for font in unique_custom_fonts: for font in unique_custom_fonts:
if (font): if font:
google_fonts_embed_name = font.replace(" ", "+") google_fonts_embed_name = font.replace(" ", "+")
font_href = f"https://fonts.googleapis.com/css2?family={google_fonts_embed_name}:wght@500;600;700&display=swap" font_href = f"https://fonts.googleapis.com/css2?family={google_fonts_embed_name}:wght@500;600;700&display=swap"
custom_font_stylesheet = soup.new_tag("link", rel="stylesheet", href=font_href) custom_font_stylesheet = soup.new_tag(
soup.head.append(custom_font_stylesheet); "link", rel="stylesheet", href=font_href
)
soup.head.append(custom_font_stylesheet)
# go through each custom font, and add a css rule overriding the font-family # go through each custom font, and add a css rule overriding the font-family
# to the font override stylesheet targeting the appropriate selector # to the font override stylesheet targeting the appropriate selector
font_override_stylesheet = soup.new_tag('style', type='text/css') font_override_stylesheet = soup.new_tag("style", type="text/css")
for target, custom_font in custom_fonts.items(): for target, custom_font in custom_fonts.items():
if custom_font and not target == "site": if custom_font and not target == "site":
log.debug(f"Setting {target} font-family to {custom_font}") log.debug(f"Setting {target} font-family to {custom_font}")
font_override_stylesheet.append(fonts_selectors[target] + " {font-family:" + custom_font + " !important} ") font_override_stylesheet.append(
fonts_selectors[target]
+ " {font-family:"
+ custom_font
+ " !important} "
)
site_font = custom_fonts.get("site", None) site_font = custom_fonts.get("site", None)
# process the global site font last so more granular settings can override it # process the global site font last so more granular settings can override it
if (site_font): if site_font:
log.debug(f"Setting global site font-family to {site_font}"), log.debug(f"Setting global site font-family to {site_font}"),
font_override_stylesheet.append(fonts_selectors["site"] + " {font-family:" + site_font + "} ") font_override_stylesheet.append(
fonts_selectors["site"] + " {font-family:" + site_font + "} "
)
# finally append the font overrides stylesheets to the page # finally append the font overrides stylesheets to the page
soup.head.append(font_override_stylesheet) soup.head.append(font_override_stylesheet)
# inject any custom elements to the page # inject any custom elements to the page
custom_injects = self.get_page_config(url).get("inject", {}) custom_injects = self.get_page_config(url).get("inject", {})
def injects_custom_tags(section): def injects_custom_tags(section):
section_custom_injects = custom_injects.get(section, {}) section_custom_injects = custom_injects.get(section, {})
for tag, elements in section_custom_injects.items(): for tag, elements in section_custom_injects.items():
@ -418,74 +517,99 @@ class Parser():
for attr, value in element.items(): for attr, value in element.items():
injected_tag[attr] = value injected_tag[attr] = value
# if the value refers to a file, copy it to the dist folder # if the value refers to a file, copy it to the dist folder
if (attr.lower() == "href" or attr.lower() == "src"): if attr.lower() == "href" or attr.lower() == "src":
log.debug(f"Copying injected file '{value}'") log.debug(f"Copying injected file '{value}'")
cached_custom_file = self.cache_file((Path.cwd() / value.strip("/"))) cached_custom_file = self.cache_file(
(Path.cwd() / value.strip("/"))
)
# destination = (self.dist_folder / source.name) # destination = (self.dist_folder / source.name)
# shutil.copyfile(source, destination) # shutil.copyfile(source, destination)
injected_tag[attr] = str(cached_custom_file) # source.name injected_tag[attr] = str(cached_custom_file) # source.name
log.debug(f"Injecting <{section}> tag: {str(injected_tag)}") log.debug(f"Injecting <{section}> tag: {str(injected_tag)}")
soup.find(section).append(injected_tag) soup.find(section).append(injected_tag)
injects_custom_tags("head") injects_custom_tags("head")
injects_custom_tags("body") injects_custom_tags("body")
# inject loconotion's custom stylesheet and script # inject loconotion's custom stylesheet and script
loconotion_custom_css = self.cache_file(Path("bundles/loconotion.css")) loconotion_custom_css = self.cache_file(Path("bundles/loconotion.css"))
custom_css = soup.new_tag("link", rel="stylesheet", href=str(loconotion_custom_css)) custom_css = soup.new_tag(
"link", rel="stylesheet", href=str(loconotion_custom_css)
)
soup.head.insert(-1, custom_css) soup.head.insert(-1, custom_css)
loconotion_custom_js = self.cache_file(Path("bundles/loconotion.js")) loconotion_custom_js = self.cache_file(Path("bundles/loconotion.js"))
custom_script = soup.new_tag("script", type="text/javascript", src=str(loconotion_custom_js)) custom_script = soup.new_tag(
"script", type="text/javascript", src=str(loconotion_custom_js)
)
soup.body.insert(-1, custom_script) soup.body.insert(-1, custom_script)
# find sub-pages and clean slugs / links # find sub-pages and clean slugs / links
sub_pages = []; sub_pages = []
for a in soup.findAll('a'): for a in soup.findAll("a"):
if a['href'].startswith('/'): if a["href"].startswith("/"):
sub_page_href = 'https://www.notion.so' + a['href'] sub_page_href = "https://www.notion.so" + a["href"]
# if the link is an anchor link, check if the page hasn't already been parsed # if the link is an anchor link,
if ("#" in sub_page_href): # check if the page hasn't already been parsed
if "#" in sub_page_href:
sub_page_href_tokens = sub_page_href.split("#") sub_page_href_tokens = sub_page_href.split("#")
sub_page_href = sub_page_href_tokens[0] sub_page_href = sub_page_href_tokens[0]
a['href'] = "#" + sub_page_href_tokens[-1] a["href"] = "#" + sub_page_href_tokens[-1]
a['class'] = a.get('class', []) + ['loconotion-anchor-link'] a["class"] = a.get("class", []) + ["loconotion-anchor-link"]
if (sub_page_href in processed_pages.keys() or sub_page_href in sub_pages): if (
log.debug(f"Original page for anchor link {sub_page_href} already parsed / pending parsing, skipping") sub_page_href in processed_pages.keys()
or sub_page_href in sub_pages
):
log.debug(
f"Original page for anchor link {sub_page_href}"
" already parsed / pending parsing, skipping"
)
continue continue
else: else:
a['href'] = self.get_page_slug(sub_page_href) if sub_page_href != index else "index.html" a["href"] = (
self.get_page_slug(sub_page_href)
if sub_page_href != index
else "index.html"
)
sub_pages.append(sub_page_href) sub_pages.append(sub_page_href)
log.debug(f"Found link to page {a['href']}") log.debug(f"Found link to page {a['href']}")
# exports the parsed page # exports the parsed page
html_str = str(soup) html_str = str(soup)
html_file = self.get_page_slug(url) if url != index else "index.html" html_file = self.get_page_slug(url) if url != index else "index.html"
if (html_file in processed_pages.values()): if html_file in processed_pages.values():
log.error(f"Found duplicate pages with slug '{html_file}' - previous one will be overwritten." + log.error(
"make sure that your notion pages names or custom slugs in the configuration files are unique") f"Found duplicate pages with slug '{html_file}' - previous one will be"
" overwritten. Make sure that your notion pages names or custom slugs"
" in the configuration files are unique"
)
log.info(f"Exporting page '{url}' as '{html_file}'") log.info(f"Exporting page '{url}' as '{html_file}'")
with open(self.dist_folder / html_file, "wb") as f: with open(self.dist_folder / html_file, "wb") as f:
f.write(html_str.encode('utf-8').strip()) f.write(html_str.encode("utf-8").strip())
processed_pages[url] = html_file processed_pages[url] = html_file
# parse sub-pages # parse sub-pages
if (sub_pages and not self.args.get("single_page", False)): if sub_pages and not self.args.get("single_page", False):
if (processed_pages): log.debug(f"Pages processed so far: {len(processed_pages)}") if processed_pages:
log.debug(f"Pages processed so far: {len(processed_pages)}")
for sub_page in sub_pages: for sub_page in sub_pages:
if not sub_page in processed_pages.keys(): if not sub_page in processed_pages.keys():
self.parse_page(sub_page, processed_pages = processed_pages, index = index) self.parse_page(
sub_page, processed_pages=processed_pages, index=index
)
# we're all done! # we're all done!
return processed_pages return processed_pages
def run(self, url): def run(self, url):
start_time = time.time() start_time = time.time()
total_processed_pages = self.parse_page(url) tot_processed_pages = self.parse_page(url)
elapsed_time = time.time() - start_time elapsed_time = time.time() - start_time
formatted_time = '{:02d}:{:02d}:{:02d}'.format(int(elapsed_time // 3600), int(elapsed_time % 3600 // 60), int(elapsed_time % 60)) formatted_time = "{:02d}:{:02d}:{:02d}".format(
log.info(f'Finished!\n\nProcessed {len(total_processed_pages)} pages in {formatted_time}') int(elapsed_time // 3600),
int(elapsed_time % 3600 // 60),
int(elapsed_time % 60),
tot_processed_pages,
)
log.info(
f"Finished!\n\nProcessed {len(tot_processed_pages)} pages in {formatted_time}"
)
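A quick worked example of the elapsed-time formatting in `run` (the value is made up):

```python
# hypothetical run time: 1 hour, 2 minutes, 3.5 seconds
elapsed_time = 3723.5
formatted_time = "{:02d}:{:02d}:{:02d}".format(
    int(elapsed_time // 3600), int(elapsed_time % 3600 // 60), int(elapsed_time % 60)
)
print(formatted_time)  # 01:02:03
```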

poetry.lock (generated, new file)

@ -0,0 +1,315 @@
[[package]]
category = "dev"
description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
name = "appdirs"
optional = false
python-versions = "*"
version = "1.4.4"
[[package]]
category = "dev"
description = "Classes Without Boilerplate"
name = "attrs"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
version = "19.3.0"
[package.extras]
azure-pipelines = ["coverage", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface", "pytest-azurepipelines"]
dev = ["coverage", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface", "sphinx", "pre-commit"]
docs = ["sphinx", "zope.interface"]
tests = ["coverage", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface"]
[[package]]
category = "main"
description = "Screen-scraping library"
name = "beautifulsoup4"
optional = false
python-versions = "*"
version = "4.9.1"
[package.dependencies]
soupsieve = [">1.2", "<2.0"]
[package.extras]
html5lib = ["html5lib"]
lxml = ["lxml"]
[[package]]
category = "dev"
description = "The uncompromising code formatter."
name = "black"
optional = false
python-versions = ">=3.6"
version = "19.10b0"
[package.dependencies]
appdirs = "*"
attrs = ">=18.1.0"
click = ">=6.5"
pathspec = ">=0.6,<1"
regex = "*"
toml = ">=0.9.4"
typed-ast = ">=1.4.0"
[package.extras]
d = ["aiohttp (>=3.3.2)", "aiohttp-cors"]
[[package]]
category = "main"
description = "Python package for providing Mozilla's CA Bundle."
name = "certifi"
optional = false
python-versions = "*"
version = "2020.4.5.1"
[[package]]
category = "main"
description = "Universal encoding detector for Python 2 and 3"
name = "chardet"
optional = false
python-versions = "*"
version = "3.0.4"
[[package]]
category = "main"
description = "Automatically install chromedriver that supports the currently installed version of chrome."
name = "chromedriver-autoinstaller"
optional = false
python-versions = ">=3"
version = "0.2.0"
[[package]]
category = "dev"
description = "Composable command line interface toolkit"
name = "click"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
version = "7.1.2"
[[package]]
category = "main"
description = "Cross-platform colored terminal text."
name = "colorama"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
version = "0.4.3"
[[package]]
category = "main"
description = "A CSS Cascading Style Sheets library for Python"
name = "cssutils"
optional = false
python-versions = "*"
version = "1.0.2"
[[package]]
category = "main"
description = "Internationalized Domain Names in Applications (IDNA)"
name = "idna"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
version = "2.9"
[[package]]
category = "dev"
description = "Utility library for gitignore style pattern matching of file paths."
name = "pathspec"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
version = "0.8.0"
[[package]]
category = "dev"
description = "Alternative regular expression module, to replace re."
name = "regex"
optional = false
python-versions = "*"
version = "2020.5.14"
[[package]]
category = "main"
description = "Python HTTP for Humans."
name = "requests"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
version = "2.23.0"
[package.dependencies]
certifi = ">=2017.4.17"
chardet = ">=3.0.2,<4"
idna = ">=2.5,<3"
urllib3 = ">=1.21.1,<1.25.0 || >1.25.0,<1.25.1 || >1.25.1,<1.26"
[package.extras]
security = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)"]
socks = ["PySocks (>=1.5.6,<1.5.7 || >1.5.7)", "win-inet-pton"]
[[package]]
category = "main"
description = "Python bindings for Selenium"
name = "selenium"
optional = false
python-versions = "*"
version = "3.141.0"
[package.dependencies]
urllib3 = "*"
[[package]]
category = "main"
description = "A modern CSS selector implementation for Beautiful Soup."
name = "soupsieve"
optional = false
python-versions = "*"
version = "1.9.6"
[[package]]
category = "main"
description = "Python Library for Tom's Obvious, Minimal Language"
name = "toml"
optional = false
python-versions = "*"
version = "0.10.1"
[[package]]
category = "dev"
description = "a fork of Python 2 and 3 ast modules with type comment support"
name = "typed-ast"
optional = false
python-versions = "*"
version = "1.4.1"
[[package]]
category = "main"
description = "HTTP library with thread-safe connection pooling, file post, and more."
name = "urllib3"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4"
version = "1.25.9"
[package.extras]
brotli = ["brotlipy (>=0.6.0)"]
secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "pyOpenSSL (>=0.14)", "ipaddress"]
socks = ["PySocks (>=1.5.6,<1.5.7 || >1.5.7,<2.0)"]
[metadata]
content-hash = "ef223e0d435f4ab7f38a6499586aecdb96924ccb7bd59cd0982d0496479ad60f"
python-versions = "^3.7"
[metadata.files]
appdirs = [
{file = "appdirs-1.4.4-py2.py3-none-any.whl", hash = "sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128"},
{file = "appdirs-1.4.4.tar.gz", hash = "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41"},
]
attrs = [
{file = "attrs-19.3.0-py2.py3-none-any.whl", hash = "sha256:08a96c641c3a74e44eb59afb61a24f2cb9f4d7188748e76ba4bb5edfa3cb7d1c"},
{file = "attrs-19.3.0.tar.gz", hash = "sha256:f7b7ce16570fe9965acd6d30101a28f62fb4a7f9e926b3bbc9b61f8b04247e72"},
]
beautifulsoup4 = [
{file = "beautifulsoup4-4.9.1-py2-none-any.whl", hash = "sha256:e718f2342e2e099b640a34ab782407b7b676f47ee272d6739e60b8ea23829f2c"},
{file = "beautifulsoup4-4.9.1-py3-none-any.whl", hash = "sha256:a6237df3c32ccfaee4fd201c8f5f9d9df619b93121d01353a64a73ce8c6ef9a8"},
{file = "beautifulsoup4-4.9.1.tar.gz", hash = "sha256:73cc4d115b96f79c7d77c1c7f7a0a8d4c57860d1041df407dd1aae7f07a77fd7"},
]
black = [
{file = "black-19.10b0-py36-none-any.whl", hash = "sha256:1b30e59be925fafc1ee4565e5e08abef6b03fe455102883820fe5ee2e4734e0b"},
{file = "black-19.10b0.tar.gz", hash = "sha256:c2edb73a08e9e0e6f65a0e6af18b059b8b1cdd5bef997d7a0b181df93dc81539"},
]
certifi = [
{file = "certifi-2020.4.5.1-py2.py3-none-any.whl", hash = "sha256:1d987a998c75633c40847cc966fcf5904906c920a7f17ef374f5aa4282abd304"},
{file = "certifi-2020.4.5.1.tar.gz", hash = "sha256:51fcb31174be6e6664c5f69e3e1691a2d72a1a12e90f872cbdb1567eb47b6519"},
]
chardet = [
{file = "chardet-3.0.4-py2.py3-none-any.whl", hash = "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"},
{file = "chardet-3.0.4.tar.gz", hash = "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae"},
]
chromedriver-autoinstaller = [
{file = "chromedriver-autoinstaller-0.2.0.tar.gz", hash = "sha256:e6aadc277f2c3a1d247541eecb60bfdeabb3250c56ad9998595420840d1c7f71"},
{file = "chromedriver_autoinstaller-0.2.0-py3-none-any.whl", hash = "sha256:290a72a1e60e5d806ac0d7cc14bd6aa0746bf8e007899efca48b25eb239ea851"},
]
click = [
{file = "click-7.1.2-py2.py3-none-any.whl", hash = "sha256:dacca89f4bfadd5de3d7489b7c8a566eee0d3676333fbb50030263894c38c0dc"},
{file = "click-7.1.2.tar.gz", hash = "sha256:d2b5255c7c6349bc1bd1e59e08cd12acbbd63ce649f2588755783aa94dfb6b1a"},
]
colorama = [
{file = "colorama-0.4.3-py2.py3-none-any.whl", hash = "sha256:7d73d2a99753107a36ac6b455ee49046802e59d9d076ef8e47b61499fa29afff"},
{file = "colorama-0.4.3.tar.gz", hash = "sha256:e96da0d330793e2cb9485e9ddfd918d456036c7149416295932478192f4436a1"},
]
cssutils = [
{file = "cssutils-1.0.2-py3-none-any.whl", hash = "sha256:c74dbe19c92f5052774eadb15136263548dd013250f1ed1027988e7fef125c8d"},
{file = "cssutils-1.0.2.tar.gz", hash = "sha256:a2fcf06467553038e98fea9cfe36af2bf14063eb147a70958cfcaa8f5786acaf"},
]
idna = [
{file = "idna-2.9-py2.py3-none-any.whl", hash = "sha256:a068a21ceac8a4d63dbfd964670474107f541babbd2250d61922f029858365fa"},
{file = "idna-2.9.tar.gz", hash = "sha256:7588d1c14ae4c77d74036e8c22ff447b26d0fde8f007354fd48a7814db15b7cb"},
]
pathspec = [
{file = "pathspec-0.8.0-py2.py3-none-any.whl", hash = "sha256:7d91249d21749788d07a2d0f94147accd8f845507400749ea19c1ec9054a12b0"},
{file = "pathspec-0.8.0.tar.gz", hash = "sha256:da45173eb3a6f2a5a487efba21f050af2b41948be6ab52b6a1e3ff22bb8b7061"},
]
regex = [
{file = "regex-2020.5.14-cp27-cp27m-win32.whl", hash = "sha256:e565569fc28e3ba3e475ec344d87ed3cd8ba2d575335359749298a0899fe122e"},
{file = "regex-2020.5.14-cp27-cp27m-win_amd64.whl", hash = "sha256:d466967ac8e45244b9dfe302bbe5e3337f8dc4dec8d7d10f5e950d83b140d33a"},
{file = "regex-2020.5.14-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:27ff7325b297fb6e5ebb70d10437592433601c423f5acf86e5bc1ee2919b9561"},
{file = "regex-2020.5.14-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:ea55b80eb0d1c3f1d8d784264a6764f931e172480a2f1868f2536444c5f01e01"},
{file = "regex-2020.5.14-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:c9bce6e006fbe771a02bda468ec40ffccbf954803b470a0345ad39c603402577"},
{file = "regex-2020.5.14-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:d881c2e657c51d89f02ae4c21d9adbef76b8325fe4d5cf0e9ad62f850f3a98fd"},
{file = "regex-2020.5.14-cp36-cp36m-win32.whl", hash = "sha256:99568f00f7bf820c620f01721485cad230f3fb28f57d8fbf4a7967ec2e446994"},
{file = "regex-2020.5.14-cp36-cp36m-win_amd64.whl", hash = "sha256:70c14743320a68c5dac7fc5a0f685be63bc2024b062fe2aaccc4acc3d01b14a1"},
{file = "regex-2020.5.14-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:a7c37f048ec3920783abab99f8f4036561a174f1314302ccfa4e9ad31cb00eb4"},
{file = "regex-2020.5.14-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:89d76ce33d3266173f5be80bd4efcbd5196cafc34100fdab814f9b228dee0fa4"},
{file = "regex-2020.5.14-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:51f17abbe973c7673a61863516bdc9c0ef467407a940f39501e786a07406699c"},
{file = "regex-2020.5.14-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:ce5cc53aa9fbbf6712e92c7cf268274eaff30f6bd12a0754e8133d85a8fb0f5f"},
{file = "regex-2020.5.14-cp37-cp37m-win32.whl", hash = "sha256:8044d1c085d49673aadb3d7dc20ef5cb5b030c7a4fa253a593dda2eab3059929"},
{file = "regex-2020.5.14-cp37-cp37m-win_amd64.whl", hash = "sha256:c2062c7d470751b648f1cacc3f54460aebfc261285f14bc6da49c6943bd48bdd"},
{file = "regex-2020.5.14-cp38-cp38-manylinux1_i686.whl", hash = "sha256:329ba35d711e3428db6b45a53b1b13a0a8ba07cbbcf10bbed291a7da45f106c3"},
{file = "regex-2020.5.14-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:579ea215c81d18da550b62ff97ee187b99f1b135fd894a13451e00986a080cad"},
{file = "regex-2020.5.14-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:3a9394197664e35566242686d84dfd264c07b20f93514e2e09d3c2b3ffdf78fe"},
{file = "regex-2020.5.14-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:ce367d21f33e23a84fb83a641b3834dd7dd8e9318ad8ff677fbfae5915a239f7"},
{file = "regex-2020.5.14-cp38-cp38-win32.whl", hash = "sha256:1386e75c9d1574f6aa2e4eb5355374c8e55f9aac97e224a8a5a6abded0f9c927"},
{file = "regex-2020.5.14-cp38-cp38-win_amd64.whl", hash = "sha256:7e61be8a2900897803c293247ef87366d5df86bf701083b6c43119c7c6c99108"},
{file = "regex-2020.5.14.tar.gz", hash = "sha256:ce450ffbfec93821ab1fea94779a8440e10cf63819be6e176eb1973a6017aff5"},
]
requests = [
{file = "requests-2.23.0-py2.py3-none-any.whl", hash = "sha256:43999036bfa82904b6af1d99e4882b560e5e2c68e5c4b0aa03b655f3d7d73fee"},
{file = "requests-2.23.0.tar.gz", hash = "sha256:b3f43d496c6daba4493e7c431722aeb7dbc6288f52a6e04e7b6023b0247817e6"},
]
selenium = [
{file = "selenium-3.141.0-py2.py3-none-any.whl", hash = "sha256:2d7131d7bc5a5b99a2d9b04aaf2612c411b03b8ca1b1ee8d3de5845a9be2cb3c"},
{file = "selenium-3.141.0.tar.gz", hash = "sha256:deaf32b60ad91a4611b98d8002757f29e6f2c2d5fcaf202e1c9ad06d6772300d"},
]
soupsieve = [
{file = "soupsieve-1.9.6-py2.py3-none-any.whl", hash = "sha256:feb1e937fa26a69e08436aad4a9037cd7e1d4c7212909502ba30701247ff8abd"},
{file = "soupsieve-1.9.6.tar.gz", hash = "sha256:7985bacc98c34923a439967c1a602dc4f1e15f923b6fcf02344184f86cc7efaa"},
]
toml = [
{file = "toml-0.10.1-py2.py3-none-any.whl", hash = "sha256:bda89d5935c2eac546d648028b9901107a595863cb36bae0c73ac804a9b4ce88"},
{file = "toml-0.10.1.tar.gz", hash = "sha256:926b612be1e5ce0634a2ca03470f95169cf16f939018233a670519cb4ac58b0f"},
]
typed-ast = [
{file = "typed_ast-1.4.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:73d785a950fc82dd2a25897d525d003f6378d1cb23ab305578394694202a58c3"},
{file = "typed_ast-1.4.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:aaee9905aee35ba5905cfb3c62f3e83b3bec7b39413f0a7f19be4e547ea01ebb"},
{file = "typed_ast-1.4.1-cp35-cp35m-win32.whl", hash = "sha256:0c2c07682d61a629b68433afb159376e24e5b2fd4641d35424e462169c0a7919"},
{file = "typed_ast-1.4.1-cp35-cp35m-win_amd64.whl", hash = "sha256:4083861b0aa07990b619bd7ddc365eb7fa4b817e99cf5f8d9cf21a42780f6e01"},
{file = "typed_ast-1.4.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:269151951236b0f9a6f04015a9004084a5ab0d5f19b57de779f908621e7d8b75"},
{file = "typed_ast-1.4.1-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:24995c843eb0ad11a4527b026b4dde3da70e1f2d8806c99b7b4a7cf491612652"},
{file = "typed_ast-1.4.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:fe460b922ec15dd205595c9b5b99e2f056fd98ae8f9f56b888e7a17dc2b757e7"},
{file = "typed_ast-1.4.1-cp36-cp36m-win32.whl", hash = "sha256:4e3e5da80ccbebfff202a67bf900d081906c358ccc3d5e3c8aea42fdfdfd51c1"},
{file = "typed_ast-1.4.1-cp36-cp36m-win_amd64.whl", hash = "sha256:249862707802d40f7f29f6e1aad8d84b5aa9e44552d2cc17384b209f091276aa"},
{file = "typed_ast-1.4.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8ce678dbaf790dbdb3eba24056d5364fb45944f33553dd5869b7580cdbb83614"},
{file = "typed_ast-1.4.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:c9e348e02e4d2b4a8b2eedb48210430658df6951fa484e59de33ff773fbd4b41"},
{file = "typed_ast-1.4.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:bcd3b13b56ea479b3650b82cabd6b5343a625b0ced5429e4ccad28a8973f301b"},
{file = "typed_ast-1.4.1-cp37-cp37m-win32.whl", hash = "sha256:d5d33e9e7af3b34a40dc05f498939f0ebf187f07c385fd58d591c533ad8562fe"},
{file = "typed_ast-1.4.1-cp37-cp37m-win_amd64.whl", hash = "sha256:0666aa36131496aed8f7be0410ff974562ab7eeac11ef351def9ea6fa28f6355"},
{file = "typed_ast-1.4.1-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:d205b1b46085271b4e15f670058ce182bd1199e56b317bf2ec004b6a44f911f6"},
{file = "typed_ast-1.4.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:6daac9731f172c2a22ade6ed0c00197ee7cc1221aa84cfdf9c31defeb059a907"},
{file = "typed_ast-1.4.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:498b0f36cc7054c1fead3d7fc59d2150f4d5c6c56ba7fb150c013fbc683a8d2d"},
{file = "typed_ast-1.4.1-cp38-cp38-win32.whl", hash = "sha256:715ff2f2df46121071622063fc7543d9b1fd19ebfc4f5c8895af64a77a8c852c"},
{file = "typed_ast-1.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:fc0fea399acb12edbf8a628ba8d2312f583bdbdb3335635db062fa98cf71fca4"},
{file = "typed_ast-1.4.1-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:d43943ef777f9a1c42bf4e552ba23ac77a6351de620aa9acf64ad54933ad4d34"},
{file = "typed_ast-1.4.1.tar.gz", hash = "sha256:8c8aaad94455178e3187ab22c8b01a3837f8ee50e09cf31f1ba129eb293ec30b"},
]
urllib3 = [
{file = "urllib3-1.25.9-py2.py3-none-any.whl", hash = "sha256:88206b0eb87e6d677d424843ac5209e3fb9d0190d0ee169599165ec25e9d9115"},
{file = "urllib3-1.25.9.tar.gz", hash = "sha256:3018294ebefce6572a474f0604c2021e33b3fd8006ecd11d62107a5d2a963527"},
]

pyproject.toml (new file)

@ -0,0 +1,22 @@
[tool.poetry]
name = "loconotion"
version = "0.1.0"
description = ""
authors = ["Leonardo Cavaletti <impeto.blu@gmail.com>"]
[tool.poetry.dependencies]
python = "^3.7"
beautifulsoup4 = "^4.9.1"
chromedriver-autoinstaller = "^0.2.0"
colorama = "^0.4.3"
cssutils = "^1.0.2"
requests = "^2.23.0"
selenium = "^3.141.0"
toml = "^0.10.1"
[tool.poetry.dev-dependencies]
black = "^19.10b0"
[build-system]
requires = ["poetry>=0.12"]
build-backend = "poetry.masonry.api"

requirements.txt

@ -1,12 +1,37 @@
beautifulsoup4==4.9.1 beautifulsoup4==4.9.1 \
certifi==2020.4.5.1 --hash=sha256:e718f2342e2e099b640a34ab782407b7b676f47ee272d6739e60b8ea23829f2c \
chardet==3.0.4 --hash=sha256:a6237df3c32ccfaee4fd201c8f5f9d9df619b93121d01353a64a73ce8c6ef9a8 \
chromedriver-autoinstaller==0.2.0 --hash=sha256:73cc4d115b96f79c7d77c1c7f7a0a8d4c57860d1041df407dd1aae7f07a77fd7
colorama==0.4.3 certifi==2020.4.5.1 \
cssutils==1.0.2 --hash=sha256:1d987a998c75633c40847cc966fcf5904906c920a7f17ef374f5aa4282abd304 \
idna==2.9 --hash=sha256:51fcb31174be6e6664c5f69e3e1691a2d72a1a12e90f872cbdb1567eb47b6519
requests==2.23.0 chardet==3.0.4 \
selenium==3.141.0 --hash=sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691 \
soupsieve==2.0.1 --hash=sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae
toml==0.10.1 chromedriver-autoinstaller==0.2.0 \
urllib3==1.25.9 --hash=sha256:e6aadc277f2c3a1d247541eecb60bfdeabb3250c56ad9998595420840d1c7f71 \
--hash=sha256:290a72a1e60e5d806ac0d7cc14bd6aa0746bf8e007899efca48b25eb239ea851
colorama==0.4.3 \
--hash=sha256:7d73d2a99753107a36ac6b455ee49046802e59d9d076ef8e47b61499fa29afff \
--hash=sha256:e96da0d330793e2cb9485e9ddfd918d456036c7149416295932478192f4436a1
cssutils==1.0.2 \
--hash=sha256:c74dbe19c92f5052774eadb15136263548dd013250f1ed1027988e7fef125c8d \
--hash=sha256:a2fcf06467553038e98fea9cfe36af2bf14063eb147a70958cfcaa8f5786acaf
idna==2.9 \
--hash=sha256:a068a21ceac8a4d63dbfd964670474107f541babbd2250d61922f029858365fa \
--hash=sha256:7588d1c14ae4c77d74036e8c22ff447b26d0fde8f007354fd48a7814db15b7cb
requests==2.23.0 \
--hash=sha256:43999036bfa82904b6af1d99e4882b560e5e2c68e5c4b0aa03b655f3d7d73fee \
--hash=sha256:b3f43d496c6daba4493e7c431722aeb7dbc6288f52a6e04e7b6023b0247817e6
selenium==3.141.0 \
--hash=sha256:2d7131d7bc5a5b99a2d9b04aaf2612c411b03b8ca1b1ee8d3de5845a9be2cb3c \
--hash=sha256:deaf32b60ad91a4611b98d8002757f29e6f2c2d5fcaf202e1c9ad06d6772300d
soupsieve==1.9.6 \
--hash=sha256:feb1e937fa26a69e08436aad4a9037cd7e1d4c7212909502ba30701247ff8abd \
--hash=sha256:7985bacc98c34923a439967c1a602dc4f1e15f923b6fcf02344184f86cc7efaa
toml==0.10.1 \
--hash=sha256:bda89d5935c2eac546d648028b9901107a595863cb36bae0c73ac804a9b4ce88 \
--hash=sha256:926b612be1e5ce0634a2ca03470f95169cf16f939018233a670519cb4ac58b0f
urllib3==1.25.9 \
--hash=sha256:88206b0eb87e6d677d424843ac5209e3fb9d0190d0ee169599165ec25e9d9115 \
--hash=sha256:3018294ebefce6572a474f0604c2021e33b3fd8006ecd11d62107a5d2a963527