mirror of
https://github.com/leoncvlt/loconotion.git
synced 2024-08-30 18:12:12 +00:00
Set up poetry and black formatter
This commit is contained in:
parent
bd76bc3089
commit
7e5a11cb6a
9
.gitignore
vendored
9
.gitignore
vendored
@ -108,10 +108,7 @@ dmypy.json
|
|||||||
.pyre/
|
.pyre/
|
||||||
|
|
||||||
# End of https://www.gitignore.io/api/python
|
# End of https://www.gitignore.io/api/python
|
||||||
|
.env
|
||||||
.vscode
|
.vscode
|
||||||
env
|
temp
|
||||||
dist/*
|
logs
|
||||||
test/*
|
|
||||||
logs/*
|
|
||||||
*.bat
|
|
@ -40,11 +40,13 @@ It does, but I wasn't really happy with the styling - the pages looked a bit ugl
|
|||||||
|
|
||||||
## Installation & Requirements
|
## Installation & Requirements
|
||||||
|
|
||||||
`pip install -r requirements.txt`
|
Make sure you're in your virtual environment of choice, then run
|
||||||
|
- `poetry install --no-dev` if you have [Poetry](https://python-poetry.org/) installed
|
||||||
|
- `pip install -r requirements.txt` otherwise
|
||||||
|
|
||||||
This script uses [ChromeDriver](https://chromedriver.chromium.org) to automate the Google Chrome browser - therefore Google Chrome needs to be installed in order to work.
|
This script uses [ChromeDriver](https://chromedriver.chromium.org) to automate the Google Chrome browser - therefore Google Chrome needs to be installed in order to work.
|
||||||
|
|
||||||
The script comes bundled with the default windows chromedriver executable. On Mac / Linux, download the right distribution for you from https://chromedriver.chromium.org/downloads and place the executable in this folder. Alternatively, use the `--chromedriver` argument to specify its path at runtime.
|
The script will automatically try to download and use the appropriate chromedriver distribution for your OS and Chrome version. If this doesn't work, download the right version for you from https://chromedriver.chromium.org/downloads and use the `--chromedriver` argument to specify its path at runtime.
|
||||||
|
|
||||||
## Simple Usage
|
## Simple Usage
|
||||||
|
|
||||||
|
@ -16,15 +16,37 @@ except ModuleNotFoundError as error:
|
|||||||
|
|
||||||
from notionparser import Parser
|
from notionparser import Parser
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
# set up argument parser
|
# set up argument parser
|
||||||
argparser = argparse.ArgumentParser(description='Generate static websites from Notion.so pages')
|
argparser = argparse.ArgumentParser(
|
||||||
argparser.add_argument('target', help='The config file containing the site properties, or the url of the Notion.so page to generate the site from')
|
description="Generate static websites from Notion.so pages"
|
||||||
argparser.add_argument('--chromedriver', help='Use a specific chromedriver executable instead of the auto-installing one')
|
)
|
||||||
argparser.add_argument("--single-page", action="store_true", help="Only parse the first page, then stop")
|
argparser.add_argument(
|
||||||
argparser.add_argument('--clean', action='store_true', help='Delete all previously cached files for the site before generating it')
|
"target",
|
||||||
argparser.add_argument('--non-headless', action='store_true', help='Run chromedriver in non-headless mode')
|
help="The config file containing the site properties, or the url"
|
||||||
argparser.add_argument("-v", "--verbose", action="store_true", help="Increasite output log verbosity")
|
" of the Notion.so page to generate the site from",
|
||||||
|
)
|
||||||
|
argparser.add_argument(
|
||||||
|
"--chromedriver",
|
||||||
|
help="Use a specific chromedriver executable instead of the auto-installing one",
|
||||||
|
)
|
||||||
|
argparser.add_argument(
|
||||||
|
"--single-page", action="store_true", help="Only parse the first page, then stop"
|
||||||
|
)
|
||||||
|
argparser.add_argument(
|
||||||
|
"--clean",
|
||||||
|
action="store_true",
|
||||||
|
help="Delete all previously cached files for the site before generating it",
|
||||||
|
)
|
||||||
|
argparser.add_argument(
|
||||||
|
"--non-headless",
|
||||||
|
action="store_true",
|
||||||
|
help="Run chromedriver in non-headless mode",
|
||||||
|
)
|
||||||
|
argparser.add_argument(
|
||||||
|
"-v", "--verbose", action="store_true", help="Increasite output log verbosity"
|
||||||
|
)
|
||||||
args = argparser.parse_args()
|
args = argparser.parse_args()
|
||||||
|
|
||||||
# set up some pretty logs
|
# set up some pretty logs
|
||||||
@ -41,7 +63,7 @@ def main():
|
|||||||
logging.INFO: colorama.Fore.BLUE,
|
logging.INFO: colorama.Fore.BLUE,
|
||||||
logging.WARNING: colorama.Fore.YELLOW,
|
logging.WARNING: colorama.Fore.YELLOW,
|
||||||
logging.ERROR: colorama.Fore.RED,
|
logging.ERROR: colorama.Fore.RED,
|
||||||
logging.CRITICAL: colorama.Back.RED
|
logging.CRITICAL: colorama.Back.RED,
|
||||||
}
|
}
|
||||||
|
|
||||||
class ColorFormatter(logging.Formatter):
|
class ColorFormatter(logging.Formatter):
|
||||||
@ -57,11 +79,14 @@ def main():
|
|||||||
)
|
)
|
||||||
return super(ColorFormatter, self).format(new_record, *args, **kwargs)
|
return super(ColorFormatter, self).format(new_record, *args, **kwargs)
|
||||||
|
|
||||||
log_screen_handler.setFormatter(ColorFormatter(fmt='%(asctime)s %(levelname)-8s %(message)s',
|
log_screen_handler.setFormatter(
|
||||||
|
ColorFormatter(
|
||||||
|
fmt="%(asctime)s %(levelname)-8s %(message)s",
|
||||||
datefmt="{color_begin}[%H:%M:%S]{color_end}".format(
|
datefmt="{color_begin}[%H:%M:%S]{color_end}".format(
|
||||||
color_begin=colorama.Style.DIM,
|
color_begin=colorama.Style.DIM, color_end=colorama.Style.RESET_ALL
|
||||||
color_end=colorama.Style.RESET_ALL
|
),
|
||||||
)))
|
)
|
||||||
|
)
|
||||||
except ModuleNotFoundError as identifier:
|
except ModuleNotFoundError as identifier:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@ -70,7 +95,7 @@ def main():
|
|||||||
if urllib.parse.urlparse(args.target).scheme:
|
if urllib.parse.urlparse(args.target).scheme:
|
||||||
try:
|
try:
|
||||||
response = requests.get(args.target)
|
response = requests.get(args.target)
|
||||||
if ("notion.so" in args.target):
|
if "notion.so" in args.target:
|
||||||
log.info("Initialising parser with simple page url")
|
log.info("Initialising parser with simple page url")
|
||||||
config = {"page": args.target}
|
config = {"page": args.target}
|
||||||
Parser(config=config, args=vars(args))
|
Parser(config=config, args=vars(args))
|
||||||
@ -88,14 +113,15 @@ def main():
|
|||||||
else:
|
else:
|
||||||
log.critical(f"Config file {args.target} does not exists")
|
log.critical(f"Config file {args.target} does not exists")
|
||||||
except FileNotFoundError as e:
|
except FileNotFoundError as e:
|
||||||
log.critical(f'FileNotFoundError: {e}')
|
log.critical(f"FileNotFoundError: {e}")
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
|
if __name__ == "__main__":
|
||||||
try:
|
try:
|
||||||
main()
|
main()
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
log.critical('Interrupted by user')
|
log.critical("Interrupted by user")
|
||||||
try:
|
try:
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
except SystemExit:
|
except SystemExit:
|
||||||
|
@ -2,38 +2,52 @@ import logging
|
|||||||
|
|
||||||
log = logging.getLogger(f"loconotion.{__name__}")
|
log = logging.getLogger(f"loconotion.{__name__}")
|
||||||
|
|
||||||
|
|
||||||
class notion_page_loaded(object):
    """An expectation for checking that a notion page has loaded.

    Instances are callables taking a selenium driver; they appear meant to be
    handed to ``WebDriverWait(...).until(...)`` (NOTE(review): the only
    visible call site is commented out - confirm before relying on it).
    """

    def __init__(self, url):
        # The url is stored on the instance but is not consulted by __call__.
        self.url = url

    def __call__(self, driver):
        """Return True once the page content is present and nothing is loading.

        Args:
            driver: object exposing ``find_elements_by_class_name``.

        Returns:
            bool: True when at least one "notion-presence-container" element
            exists and no "loading-spinner" element is found, False otherwise.
        """
        notion_presence = len(
            driver.find_elements_by_class_name("notion-presence-container")
        )
        # This name used to be read without ever being assigned, so every call
        # raised NameError. The class name matches the one the
        # toggle_block_has_opened expectation looks for.
        # NOTE(review): assumes page-level spinners use that same class - confirm.
        loading_spinners = len(driver.find_elements_by_class_name("loading-spinner"))
        log.debug(
            f"Waiting for page content to load"
            f" (presence container: {notion_presence}, loaders: {loading_spinners} )"
        )
        return bool(notion_presence and not loading_spinners)
|
||||||
|
|
||||||
|
|
||||||
class toggle_block_has_opened(object):
    """An expectation for checking that a notion toggle block has been opened.

    It does so by checking if the div hosting the content has enough children,
    and the absence of the loading spinner.
    """

    def __init__(self, toggle_block):
        # Element of the toggle block being watched; all lookups start from it.
        self.toggle_block = toggle_block

    def __call__(self, driver):
        """Return True once the toggle content is populated and not loading.

        Args:
            driver: accepted so the instance can be used as a wait condition;
                it is not used, every query goes through ``self.toggle_block``.

        Returns:
            bool: True when the content div holds more than 3 nested divs and
            no "loading-spinner" element is present inside the toggle block.
        """
        # The selector previously lacked its closing parenthesis.
        toggle_content = self.toggle_block.find_element_by_css_selector(
            "div:not([style])"
        )
        if not toggle_content:
            return False

        content_children = len(toggle_content.find_elements_by_tag_name("div"))
        is_loading = len(
            self.toggle_block.find_elements_by_class_name("loading-spinner")
        )
        log.debug(
            f"Waiting for toggle block to load"
            f" ({content_children} children so far and {is_loading} loaders)"
        )
        return content_children > 3 and not is_loading
|
||||||
|
@ -25,6 +25,7 @@ try:
|
|||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
import requests
|
import requests
|
||||||
import cssutils
|
import cssutils
|
||||||
|
|
||||||
cssutils.log.setLevel(logging.CRITICAL) # removes warning logs from cssutils
|
cssutils.log.setLevel(logging.CRITICAL) # removes warning logs from cssutils
|
||||||
except ModuleNotFoundError as error:
|
except ModuleNotFoundError as error:
|
||||||
log.critical(f"ModuleNotFoundError: {error}. have your installed the requirements?")
|
log.critical(f"ModuleNotFoundError: {error}. have your installed the requirements?")
|
||||||
@ -32,14 +33,18 @@ except ModuleNotFoundError as error:
|
|||||||
|
|
||||||
from conditions import toggle_block_has_opened
|
from conditions import toggle_block_has_opened
|
||||||
|
|
||||||
class Parser():
|
|
||||||
|
class Parser:
|
||||||
def __init__(self, config={}, args={}):
|
def __init__(self, config={}, args={}):
|
||||||
self.config = config
|
self.config = config
|
||||||
self.args = args
|
self.args = args
|
||||||
url = self.config.get("page", None)
|
url = self.config.get("page", None)
|
||||||
if not url:
|
if not url:
|
||||||
log.critical("No initial page url specified. If passing a configuration file," +
|
log.critical(
|
||||||
"make sure it contains a 'page' key with the url of the notion.so page to parse")
|
"No initial page url specified. If passing a configuration file,"
|
||||||
|
" make sure it contains a 'page' key with the url of the notion.so"
|
||||||
|
" page to parse"
|
||||||
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
# get the site name from the config, or make it up by cleaning the target page's slug
|
# get the site name from the config, or make it up by cleaning the target page's slug
|
||||||
@ -50,7 +55,7 @@ class Parser():
|
|||||||
log.info(f"Setting output path to '{self.dist_folder}'")
|
log.info(f"Setting output path to '{self.dist_folder}'")
|
||||||
|
|
||||||
# check if the argument to clean the dist folder was passed
|
# check if the argument to clean the dist folder was passed
|
||||||
if (self.args.get("clean", False)):
|
if self.args.get("clean", False):
|
||||||
try:
|
try:
|
||||||
shutil.rmtree(self.dist_folder)
|
shutil.rmtree(self.dist_folder)
|
||||||
log.info(f"Removing previously cached files in '{self.dist_folder}'")
|
log.info(f"Removing previously cached files in '{self.dist_folder}'")
|
||||||
@ -69,25 +74,36 @@ class Parser():
|
|||||||
site_config = self.config.get("site", {})
|
site_config = self.config.get("site", {})
|
||||||
|
|
||||||
# check if there's anything wrong with the site config
|
# check if there's anything wrong with the site config
|
||||||
if (site_config.get("slug", None)):
|
if site_config.get("slug", None):
|
||||||
log.error("'slug' parameter has no effect in the [site] table, and should only present in page tables.")
|
log.error(
|
||||||
del site_config['slug']
|
"'slug' parameter has no effect in the [site] table, "
|
||||||
|
"and should only present in page tables."
|
||||||
|
)
|
||||||
|
del site_config["slug"]
|
||||||
|
|
||||||
# find a table in the configuration file whose key contains the passed token string
|
# find a table in the configuration file whose key contains the passed token string
|
||||||
site_pages_config = self.config.get("pages", {})
|
site_pages_config = self.config.get("pages", {})
|
||||||
matching_pages_config = [value for key, value in site_pages_config.items() if key.lower() in token]
|
matching_pages_config = [
|
||||||
if (matching_pages_config):
|
value for key, value in site_pages_config.items() if key.lower() in token
|
||||||
if (len(matching_pages_config) > 1):
|
]
|
||||||
log.error(f"multiple matching page config tokens found for {token} in configuration file. Make sure pages urls / slugs are unique")
|
if matching_pages_config:
|
||||||
|
if len(matching_pages_config) > 1:
|
||||||
|
log.error(
|
||||||
|
f"multiple matching page config tokens found for {token}"
|
||||||
|
" in configuration file. Make sure pages urls / slugs are unique"
|
||||||
|
)
|
||||||
return site_config
|
return site_config
|
||||||
else:
|
else:
|
||||||
# if found, merge it on top of the global site configuration table
|
# if found, merge it on top of the global site configuration table
|
||||||
# log.debug(f"Config table found for page with token {token}")
|
# log.debug(f"Config table found for page with token {token}")
|
||||||
matching_page_config = matching_pages_config[0]
|
matching_page_config = matching_pages_config[0]
|
||||||
if (type(matching_page_config) is dict):
|
if type(matching_page_config) is dict:
|
||||||
return {**site_config, **matching_page_config}
|
return {**site_config, **matching_page_config}
|
||||||
else:
|
else:
|
||||||
log.error(f"Matching page configuration for {url} was not a dict: {matching_page_config} - something went wrong")
|
log.error(
|
||||||
|
f"Matching page configuration for {url} was not a dict:"
|
||||||
|
f" {matching_page_config} - something went wrong"
|
||||||
|
)
|
||||||
return site_config
|
return site_config
|
||||||
else:
|
else:
|
||||||
# log.debug(f"No config table found for page token {token}, using global site config table")
|
# log.debug(f"No config table found for page token {token}, using global site config table")
|
||||||
@ -102,11 +118,11 @@ class Parser():
|
|||||||
else:
|
else:
|
||||||
# if not, clean up the existing slug
|
# if not, clean up the existing slug
|
||||||
path = urllib.parse.urlparse(url).path.strip("/")
|
path = urllib.parse.urlparse(url).path.strip("/")
|
||||||
if ("-" in path and len(path.split("-")) > 1):
|
if "-" in path and len(path.split("-")) > 1:
|
||||||
# a standard notion page looks like the-page-title-[uiid]
|
# a standard notion page looks like the-page-title-[uiid]
|
||||||
# strip the uuid and keep the page title only
|
# strip the uuid and keep the page title only
|
||||||
path = "-".join(path.split("-")[:-1]).lower()
|
path = "-".join(path.split("-")[:-1]).lower()
|
||||||
elif ("?" in path):
|
elif "?" in path:
|
||||||
# database pages just have an uiid and a query param
|
# database pages just have an uiid and a query param
|
||||||
# not much to do here, just get rid of the query param
|
# not much to do here, just get rid of the query param
|
||||||
path = path.split("?")[0].lower()
|
path = path.split("?")[0].lower()
|
||||||
@ -118,19 +134,19 @@ class Parser():
|
|||||||
|
|
||||||
# if no filename specificed, generate an hashed id based the query-less url,
|
# if no filename specificed, generate an hashed id based the query-less url,
|
||||||
# so we avoid re-downloading / caching files we already have
|
# so we avoid re-downloading / caching files we already have
|
||||||
if (not filename):
|
if not filename:
|
||||||
parsed_url = urllib.parse.urlparse(url)
|
parsed_url = urllib.parse.urlparse(url)
|
||||||
queryless_url = parsed_url.netloc + parsed_url.path
|
queryless_url = parsed_url.netloc + parsed_url.path
|
||||||
query_params = urllib.parse.parse_qs(parsed_url.query)
|
query_params = urllib.parse.parse_qs(parsed_url.query)
|
||||||
# if any of the query params contains a size parameters store it in the has
|
# if any of the query params contains a size parameters store it in the has
|
||||||
# so we can download other higher-resolution versions if needed
|
# so we can download other higher-resolution versions if needed
|
||||||
if ("width" in query_params.keys()):
|
if "width" in query_params.keys():
|
||||||
queryless_url = queryless_url + f"?width={query_params['width']}"
|
queryless_url = queryless_url + f"?width={query_params['width']}"
|
||||||
filename = hashlib.sha1(str.encode(queryless_url)).hexdigest();
|
filename = hashlib.sha1(str.encode(queryless_url)).hexdigest()
|
||||||
destination = self.dist_folder / filename
|
destination = self.dist_folder / filename
|
||||||
|
|
||||||
# check if there are any files matching the filename, ignoring extension
|
# check if there are any files matching the filename, ignoring extension
|
||||||
matching_file = glob.glob(str(destination.with_suffix('.*')))
|
matching_file = glob.glob(str(destination.with_suffix(".*")))
|
||||||
if not matching_file:
|
if not matching_file:
|
||||||
# if url has a network scheme, download the file
|
# if url has a network scheme, download the file
|
||||||
if "http" in urllib.parse.urlparse(url).scheme:
|
if "http" in urllib.parse.urlparse(url).scheme:
|
||||||
@ -146,11 +162,11 @@ class Parser():
|
|||||||
# if the filename does not have an extension at this point,
|
# if the filename does not have an extension at this point,
|
||||||
# try to infer it from the url, and if not possible,
|
# try to infer it from the url, and if not possible,
|
||||||
# from the content-type header mimetype
|
# from the content-type header mimetype
|
||||||
if (not destination.suffix):
|
if not destination.suffix:
|
||||||
file_extension = Path(urllib.parse.urlparse(url).path).suffix
|
file_extension = Path(urllib.parse.urlparse(url).path).suffix
|
||||||
if (not file_extension):
|
if not file_extension:
|
||||||
content_type = response.headers.get('content-type')
|
content_type = response.headers.get("content-type")
|
||||||
if (content_type):
|
if content_type:
|
||||||
file_extension = mimetypes.guess_extension(content_type)
|
file_extension = mimetypes.guess_extension(content_type)
|
||||||
destination = destination.with_suffix(file_extension)
|
destination = destination.with_suffix(file_extension)
|
||||||
|
|
||||||
@ -177,51 +193,65 @@ class Parser():
|
|||||||
|
|
||||||
def init_chromedriver(self):
|
def init_chromedriver(self):
|
||||||
chromedriver_path = self.args.get("chromedriver")
|
chromedriver_path = self.args.get("chromedriver")
|
||||||
if (not chromedriver_path):
|
if not chromedriver_path:
|
||||||
try:
|
try:
|
||||||
chromedriver_path = chromedriver_autoinstaller.install()
|
chromedriver_path = chromedriver_autoinstaller.install()
|
||||||
except Exception as exception:
|
except Exception as exception:
|
||||||
log.critical(f"Failed to install the built-in chromedriver: {exception}\n" +
|
log.critical(
|
||||||
"download the correct version for your system at https://chromedriver.chromium.org/downloads" +
|
f"Failed to install the built-in chromedriver: {exception}\n"
|
||||||
"and use the --chromedriver argument to point to the chromedriver executable")
|
"\nDownload the correct version for your system at"
|
||||||
|
" https://chromedriver.chromium.org/downloads and use the"
|
||||||
|
" --chromedriver argument to point to the chromedriver executable"
|
||||||
|
)
|
||||||
sys.exit()
|
sys.exit()
|
||||||
|
|
||||||
log.info(f"Initialising chromedriver at {chromedriver_path}")
|
log.info(f"Initialising chromedriver at {chromedriver_path}")
|
||||||
logs_path = (Path.cwd() / "logs" / "webdrive.log")
|
logs_path = Path.cwd() / "logs" / "webdrive.log"
|
||||||
logs_path.parent.mkdir(parents=True, exist_ok=True)
|
logs_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
chrome_options = Options()
|
chrome_options = Options()
|
||||||
if (not self.args.get("non_headless", False)):
|
if not self.args.get("non_headless", False):
|
||||||
chrome_options.add_argument("--headless")
|
chrome_options.add_argument("--headless")
|
||||||
chrome_options.add_argument("window-size=1920,1080")
|
chrome_options.add_argument("window-size=1920,1080")
|
||||||
chrome_options.add_argument("--log-level=3");
|
chrome_options.add_argument("--log-level=3")
|
||||||
chrome_options.add_argument("--silent");
|
chrome_options.add_argument("--silent")
|
||||||
chrome_options.add_argument("--disable-logging")
|
chrome_options.add_argument("--disable-logging")
|
||||||
# removes the 'DevTools listening' log message
|
# removes the 'DevTools listening' log message
|
||||||
chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
|
chrome_options.add_experimental_option("excludeSwitches", ["enable-logging"])
|
||||||
return webdriver.Chrome(
|
return webdriver.Chrome(
|
||||||
executable_path=str(chromedriver_path),
|
executable_path=str(chromedriver_path),
|
||||||
service_log_path=str(logs_path),
|
service_log_path=str(logs_path),
|
||||||
options=chrome_options)
|
options=chrome_options,
|
||||||
|
)
|
||||||
|
|
||||||
def parse_page(self, url, processed_pages={}, index=None):
|
def parse_page(self, url, processed_pages={}, index=None):
|
||||||
# if this is the first page being parse, set it as the index.html
|
# if this is the first page being parse, set it as the index.html
|
||||||
if (not index):
|
if not index:
|
||||||
index = url;
|
index = url
|
||||||
|
|
||||||
log.info(f"Parsing page '{url}'")
|
log.info(f"Parsing page '{url}'")
|
||||||
log.debug(f"Using page config: {self.get_page_config(url)}")
|
log.debug(f"Using page config: {self.get_page_config(url)}")
|
||||||
self.driver.get(url)
|
self.driver.get(url)
|
||||||
|
|
||||||
# if ("This content does not exist" in self.driver.page_source):
|
# if "This content does not exist" in self.driver.page_source:
|
||||||
# log.error(f"No content found in {url}. Are you sure the page is set to public?")
|
# log.error(
|
||||||
|
# f"No content found in {url}."
|
||||||
|
# " Are you sure the page is set to public?"
|
||||||
|
# )
|
||||||
# return
|
# return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# WebDriverWait(self.driver, 10).until(notion_page_loaded())
|
# WebDriverWait(self.driver, 10).until(notion_page_loaded())
|
||||||
WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'notion-presence-container')))
|
WebDriverWait(self.driver, 10).until(
|
||||||
|
EC.presence_of_element_located(
|
||||||
|
(By.CLASS_NAME, "notion-presence-container")
|
||||||
|
)
|
||||||
|
)
|
||||||
except TimeoutException as ex:
|
except TimeoutException as ex:
|
||||||
log.critical("Timeout waiting for page content to load, or no content found. Are you sure the page is set to public?")
|
log.critical(
|
||||||
|
"Timeout waiting for page content to load, or no content found."
|
||||||
|
" Are you sure the page is set to public?"
|
||||||
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
# cooldown to allow eventual database items to load
|
# cooldown to allow eventual database items to load
|
||||||
@ -231,185 +261,254 @@ class Parser():
|
|||||||
# function to expand all the toggle block in the page to make their content visible
|
# function to expand all the toggle block in the page to make their content visible
|
||||||
# so we can hook up our custom toggle logic afterwards
|
# so we can hook up our custom toggle logic afterwards
|
||||||
def open_toggle_blocks(exclude=[]):
|
def open_toggle_blocks(exclude=[]):
|
||||||
opened_toggles = exclude;
|
opened_toggles = exclude
|
||||||
toggle_blocks = self.driver.find_elements_by_class_name("notion-toggle-block")
|
toggle_blocks = self.driver.find_elements_by_class_name("notion-toggle-block")
|
||||||
log.debug(f"Opening {len(toggle_blocks)} new toggle blocks in the page")
|
log.debug(f"Opening {len(toggle_blocks)} new toggle blocks in the page")
|
||||||
for toggle_block in toggle_blocks:
|
for toggle_block in toggle_blocks:
|
||||||
if (not toggle_block in opened_toggles):
|
if not toggle_block in opened_toggles:
|
||||||
toggle_button = toggle_block.find_element_by_css_selector("div[role=button]")
|
toggle_button = toggle_block.find_element_by_css_selector(
|
||||||
|
"div[role=button]"
|
||||||
|
)
|
||||||
# check if the toggle is already open by the direction of its arrow
|
# check if the toggle is already open by the direction of its arrow
|
||||||
is_toggled = "(180deg)" in (toggle_button.find_element_by_tag_name("svg").get_attribute("style"))
|
is_toggled = "(180deg)" in (
|
||||||
if (not is_toggled):
|
toggle_button.find_element_by_tag_name("svg").get_attribute(
|
||||||
|
"style"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
if not is_toggled:
|
||||||
# click on it, then wait until all elements are displayed
|
# click on it, then wait until all elements are displayed
|
||||||
toggle_button.click()
|
toggle_button.click()
|
||||||
try:
|
try:
|
||||||
WebDriverWait(self.driver, 10).until(toggle_block_has_opened(toggle_block))
|
WebDriverWait(self.driver, 10).until(
|
||||||
|
toggle_block_has_opened(toggle_block)
|
||||||
|
)
|
||||||
except TimeoutException as ex:
|
except TimeoutException as ex:
|
||||||
log.warning("Timeout waiting for toggle block to open. Likely it's already open, but doesn't hurt to check.")
|
log.warning(
|
||||||
|
"Timeout waiting for toggle block to open."
|
||||||
|
" Likely it's already open, but doesn't hurt to check."
|
||||||
|
)
|
||||||
except Exception as ex:
|
except Exception as ex:
|
||||||
log.error("Something went wrong with selenium while trying to open a toggle block")
|
log.error(
|
||||||
|
"Something went wrong while trying to open a toggle block"
|
||||||
|
)
|
||||||
opened_toggles.append(toggle_block)
|
opened_toggles.append(toggle_block)
|
||||||
# after all toggles have been opened, check the page again to see if
|
# after all toggles have been opened, check the page again to see if
|
||||||
# any toggle block had nested toggle blocks inside them
|
# any toggle block had nested toggle blocks inside them
|
||||||
new_toggle_blocks = self.driver.find_elements_by_class_name("notion-toggle-block")
|
new_toggle_blocks = self.driver.find_elements_by_class_name(
|
||||||
if (len(new_toggle_blocks) > len(toggle_blocks)):
|
"notion-toggle-block"
|
||||||
|
)
|
||||||
|
if len(new_toggle_blocks) > len(toggle_blocks):
|
||||||
# if so, run the function again
|
# if so, run the function again
|
||||||
open_toggle_blocks(opened_toggles)
|
open_toggle_blocks(opened_toggles)
|
||||||
|
|
||||||
# open the toggle blocks in the page
|
# open the toggle blocks in the page
|
||||||
open_toggle_blocks()
|
open_toggle_blocks()
|
||||||
|
|
||||||
# creates soup from the page to start parsing
|
# creates soup from the page to start parsing
|
||||||
soup = BeautifulSoup(self.driver.page_source, "html.parser")
|
soup = BeautifulSoup(self.driver.page_source, "html.parser")
|
||||||
|
|
||||||
|
|
||||||
# remove scripts and other tags we don't want / need
|
# remove scripts and other tags we don't want / need
|
||||||
for unwanted in soup.findAll('script'):
|
for unwanted in soup.findAll("script"):
|
||||||
unwanted.decompose();
|
unwanted.decompose()
|
||||||
for intercom_frame in soup.findAll('div',{'id':'intercom-frame'}):
|
for intercom_frame in soup.findAll("div", {"id": "intercom-frame"}):
|
||||||
intercom_frame.decompose();
|
intercom_frame.decompose()
|
||||||
for intercom_div in soup.findAll('div',{'class':'intercom-lightweight-app'}):
|
for intercom_div in soup.findAll("div", {"class": "intercom-lightweight-app"}):
|
||||||
intercom_div.decompose();
|
intercom_div.decompose()
|
||||||
for overlay_div in soup.findAll('div',{'class':'notion-overlay-container'}):
|
for overlay_div in soup.findAll("div", {"class": "notion-overlay-container"}):
|
||||||
overlay_div.decompose();
|
overlay_div.decompose()
|
||||||
for vendors_css in soup.find_all("link", href=lambda x: x and 'vendors~' in x):
|
for vendors_css in soup.find_all("link", href=lambda x: x and "vendors~" in x):
|
||||||
vendors_css.decompose();
|
vendors_css.decompose()
|
||||||
|
|
||||||
|
|
||||||
# clean up the default notion meta tags
|
# clean up the default notion meta tags
|
||||||
for tag in ["description", "twitter:card", "twitter:site", "twitter:title", "twitter:description", "twitter:image", "twitter:url", "apple-itunes-app"]:
|
for tag in [
|
||||||
|
"description",
|
||||||
|
"twitter:card",
|
||||||
|
"twitter:site",
|
||||||
|
"twitter:title",
|
||||||
|
"twitter:description",
|
||||||
|
"twitter:image",
|
||||||
|
"twitter:url",
|
||||||
|
"apple-itunes-app",
|
||||||
|
]:
|
||||||
unwanted_tag = soup.find("meta", attrs={"name": tag})
|
unwanted_tag = soup.find("meta", attrs={"name": tag})
|
||||||
if (unwanted_tag): unwanted_tag.decompose();
|
if unwanted_tag:
|
||||||
for tag in ["og:site_name", "og:type", "og:url", "og:title", "og:description", "og:image"]:
|
unwanted_tag.decompose()
|
||||||
|
for tag in [
|
||||||
|
"og:site_name",
|
||||||
|
"og:type",
|
||||||
|
"og:url",
|
||||||
|
"og:title",
|
||||||
|
"og:description",
|
||||||
|
"og:image",
|
||||||
|
]:
|
||||||
unwanted_og_tag = soup.find("meta", attrs={"property": tag})
|
unwanted_og_tag = soup.find("meta", attrs={"property": tag})
|
||||||
if (unwanted_og_tag): unwanted_og_tag.decompose();
|
if unwanted_og_tag:
|
||||||
|
unwanted_og_tag.decompose()
|
||||||
|
|
||||||
# set custom meta tags
|
# set custom meta tags
|
||||||
custom_meta_tags = self.get_page_config(url).get("meta", [])
|
custom_meta_tags = self.get_page_config(url).get("meta", [])
|
||||||
for custom_meta_tag in custom_meta_tags:
|
for custom_meta_tag in custom_meta_tags:
|
||||||
tag = soup.new_tag('meta')
|
tag = soup.new_tag("meta")
|
||||||
for attr, value in custom_meta_tag.items():
|
for attr, value in custom_meta_tag.items():
|
||||||
tag.attrs[attr] = value
|
tag.attrs[attr] = value
|
||||||
log.debug(f"Adding meta tag {str(tag)}")
|
log.debug(f"Adding meta tag {str(tag)}")
|
||||||
soup.head.append(tag)
|
soup.head.append(tag)
|
||||||
|
|
||||||
|
|
||||||
# process images
|
# process images
|
||||||
cache_images = True
|
cache_images = True
|
||||||
for img in soup.findAll('img'):
|
for img in soup.findAll("img"):
|
||||||
if img.has_attr('src'):
|
if img.has_attr("src"):
|
||||||
if (cache_images and not 'data:image' in img['src']):
|
if cache_images and not "data:image" in img["src"]:
|
||||||
img_src = img['src']
|
img_src = img["src"]
|
||||||
# if the path starts with /, it's one of notion's predefined images
|
# if the path starts with /, it's one of notion's predefined images
|
||||||
if (img['src'].startswith('/')):
|
if img["src"].startswith("/"):
|
||||||
img_src = "https://www.notion.so" + img['src']
|
img_src = "https://www.notion.so" + img["src"]
|
||||||
# notion's own default images urls are in a weird format, need to sanitize them
|
# notion's own default images urls are in a weird format, need to sanitize them
|
||||||
# img_src = 'https://www.notion.so' + img['src'].split("notion.so")[-1].replace("notion.so", "").split("?")[0]
|
# img_src = 'https://www.notion.so' + img['src'].split("notion.so")[-1].replace("notion.so", "").split("?")[0]
|
||||||
# if (not '.amazonaws' in img_src):
|
# if (not '.amazonaws' in img_src):
|
||||||
# img_src = urllib.parse.unquote(img_src)
|
# img_src = urllib.parse.unquote(img_src)
|
||||||
|
|
||||||
cached_image = self.cache_file(img_src)
|
cached_image = self.cache_file(img_src)
|
||||||
img['src'] = cached_image
|
img["src"] = cached_image
|
||||||
else:
|
else:
|
||||||
if (img['src'].startswith('/')):
|
if img["src"].startswith("/"):
|
||||||
img['src'] = "https://www.notion.so" + img['src']
|
img["src"] = "https://www.notion.so" + img["src"]
|
||||||
|
|
||||||
|
|
||||||
# process stylesheets
|
# process stylesheets
|
||||||
for link in soup.findAll('link', rel="stylesheet"):
|
for link in soup.findAll("link", rel="stylesheet"):
|
||||||
if link.has_attr('href') and link['href'].startswith('/'):
|
if link.has_attr("href") and link["href"].startswith("/"):
|
||||||
# we don't need the vendors stylesheet
|
# we don't need the vendors stylesheet
|
||||||
if ("vendors~" in link['href']):
|
if "vendors~" in link["href"]:
|
||||||
continue
|
continue
|
||||||
# css_file = link['href'].strip("/")
|
# css_file = link['href'].strip("/")
|
||||||
cached_css_file = self.cache_file('https://www.notion.so' + link['href'])
|
cached_css_file = self.cache_file("https://www.notion.so" + link["href"])
|
||||||
with open(self.dist_folder / cached_css_file, 'rb') as f:
|
with open(self.dist_folder / cached_css_file, "rb") as f:
|
||||||
stylesheet = cssutils.parseString(f.read())
|
stylesheet = cssutils.parseString(f.read())
|
||||||
# open the stylesheet and check for any font-face rule,
|
# open the stylesheet and check for any font-face rule,
|
||||||
for rule in stylesheet.cssRules:
|
for rule in stylesheet.cssRules:
|
||||||
if rule.type == cssutils.css.CSSRule.FONT_FACE_RULE:
|
if rule.type == cssutils.css.CSSRule.FONT_FACE_RULE:
|
||||||
# if any are found, download the font file
|
# if any are found, download the font file
|
||||||
font_file = rule.style['src'].split("url(/")[-1].split(") format")[0]
|
font_file = (
|
||||||
cached_font_file = self.cache_file(f'https://www.notion.so/{font_file}')
|
rule.style["src"].split("url(/")[-1].split(") format")[0]
|
||||||
rule.style['src'] = f"url({str(cached_font_file)})"
|
)
|
||||||
link['href'] = str(cached_css_file)
|
cached_font_file = self.cache_file(
|
||||||
|
f"https://www.notion.so/{font_file}"
|
||||||
|
)
|
||||||
|
rule.style["src"] = f"url({str(cached_font_file)})"
|
||||||
|
link["href"] = str(cached_css_file)
|
||||||
|
|
||||||
# add our custom logic to all toggle blocks
|
# add our custom logic to all toggle blocks
|
||||||
for toggle_block in soup.findAll('div',{'class':'notion-toggle-block'}):
|
for toggle_block in soup.findAll("div", {"class": "notion-toggle-block"}):
|
||||||
toggle_id = uuid.uuid4()
|
toggle_id = uuid.uuid4()
|
||||||
toggle_button = toggle_block.select_one('div[role=button]')
|
toggle_button = toggle_block.select_one("div[role=button]")
|
||||||
toggle_content = toggle_block.find('div', {'class': None, 'style': ''})
|
toggle_content = toggle_block.find("div", {"class": None, "style": ""})
|
||||||
if (toggle_button and toggle_content):
|
if toggle_button and toggle_content:
|
||||||
# add a custom class to the toggle button and content, plus a custom attribute
|
# add a custom class to the toggle button and content,
|
||||||
# sharing a unique uiid so we can hook them up with some custom js logic later
|
# plus a custom attribute sharing a unique uiid so
|
||||||
toggle_button['class'] = toggle_block.get('class', []) + ['loconotion-toggle-button']
|
# we can hook them up with some custom js logic later
|
||||||
toggle_content['class'] = toggle_content.get('class', []) + ['loconotion-toggle-content']
|
toggle_button["class"] = toggle_block.get("class", []) + [
|
||||||
toggle_content.attrs['loconotion-toggle-id'] = toggle_button.attrs['loconotion-toggle-id'] = toggle_id
|
"loconotion-toggle-button"
|
||||||
|
]
|
||||||
|
toggle_content["class"] = toggle_content.get("class", []) + [
|
||||||
|
"loconotion-toggle-content"
|
||||||
|
]
|
||||||
|
toggle_content.attrs["loconotion-toggle-id"] = toggle_button.attrs[
|
||||||
|
"loconotion-toggle-id"
|
||||||
|
] = toggle_id
|
||||||
|
|
||||||
# if there are any table views in the page, add links to the title rows
|
# if there are any table views in the page, add links to the title rows
|
||||||
for table_view in soup.findAll('div', {'class':'notion-table-view'}):
|
for table_view in soup.findAll("div", {"class": "notion-table-view"}):
|
||||||
for table_row in table_view.findAll('div', {'class':'notion-collection-item'}):
|
for table_row in table_view.findAll(
|
||||||
|
"div", {"class": "notion-collection-item"}
|
||||||
|
):
|
||||||
# for each row, hover the mouse over it to make the open button appear,
|
# for each row, hover the mouse over it to make the open button appear,
|
||||||
# then grab its href and wrap the table row's name into a link
|
# then grab its href and wrap the table row's name into a link
|
||||||
table_row_block_id = table_row['data-block-id']
|
table_row_block_id = table_row["data-block-id"]
|
||||||
table_row_hover_target = self.driver.find_element_by_css_selector(f"div[data-block-id='{table_row_block_id}'] > div > div")
|
table_row_hover_target = self.driver.find_element_by_css_selector(
|
||||||
# need to scroll the row into view or else the open button won't visible to selenium
|
f"div[data-block-id='{table_row_block_id}'] > div > div"
|
||||||
self.driver.execute_script("arguments[0].scrollIntoView();", table_row_hover_target)
|
)
|
||||||
ActionChains(self.driver).move_to_element(table_row_hover_target).perform()
|
# need to scroll the row into view or else
|
||||||
|
# the open button won't visible to selenium
|
||||||
|
self.driver.execute_script(
|
||||||
|
"arguments[0].scrollIntoView();", table_row_hover_target
|
||||||
|
)
|
||||||
|
ActionChains(self.driver).move_to_element(
|
||||||
|
table_row_hover_target
|
||||||
|
).perform()
|
||||||
try:
|
try:
|
||||||
WebDriverWait(self.driver, 5).until(EC.visibility_of_element_located(
|
WebDriverWait(self.driver, 5).until(
|
||||||
(By.CSS_SELECTOR, f"div[data-block-id='{table_row_block_id}'] > div > a")))
|
EC.visibility_of_element_located(
|
||||||
|
(
|
||||||
|
By.CSS_SELECTOR,
|
||||||
|
f"div[data-block-id='{table_row_block_id}'] > div > a",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
except TimeoutException as ex:
|
except TimeoutException as ex:
|
||||||
log.error(f"Timeout waiting for the 'open' button for row in table with block id {table_row_block_id}")
|
log.error(
|
||||||
table_row_href = self.driver.find_element_by_css_selector(f"div[data-block-id='{table_row_block_id}'] > div > a").get_attribute('href')
|
f"Timeout waiting for the 'open' button to appear for"
|
||||||
|
f" row in table with block id {table_row_block_id}"
|
||||||
|
)
|
||||||
|
table_row_href = self.driver.find_element_by_css_selector(
|
||||||
|
f"div[data-block-id='{table_row_block_id}'] > div > a"
|
||||||
|
).get_attribute("href")
|
||||||
table_row_href = table_row_href.split("notion.so")[-1]
|
table_row_href = table_row_href.split("notion.so")[-1]
|
||||||
row_target_span = table_row.find("span")
|
row_target_span = table_row.find("span")
|
||||||
row_link_wrapper = soup.new_tag('a', attrs={'href': table_row_href, 'style':"cursor: pointer;"})
|
row_link_wrapper = soup.new_tag(
|
||||||
|
"a", attrs={"href": table_row_href, "style": "cursor: pointer;"}
|
||||||
|
)
|
||||||
row_target_span.wrap(row_link_wrapper)
|
row_target_span.wrap(row_link_wrapper)
|
||||||
|
|
||||||
|
|
||||||
# embed custom google font(s)
|
# embed custom google font(s)
|
||||||
fonts_selectors = {
|
fonts_selectors = {
|
||||||
"site": "div:not(.notion-code-block)",
|
"site": "div:not(.notion-code-block)",
|
||||||
"navbar": ".notion-topbar div",
|
"navbar": ".notion-topbar div",
|
||||||
"title": ".notion-page-block > div, .notion-collection_view_page-block > div",
|
"title": ".notion-page-block > div, .notion-collection_view_page-block > div",
|
||||||
"h1" : ".notion-header-block div, notion-page-content > notion-collection_view-block > div:first-child div",
|
"h1": ".notion-header-block div, notion-page-content >"
|
||||||
|
" notion-collection_view-block > div:first-child div",
|
||||||
"h2": ".notion-sub_header-block div",
|
"h2": ".notion-sub_header-block div",
|
||||||
"h3": ".notion-sub_sub_header-block div",
|
"h3": ".notion-sub_sub_header-block div",
|
||||||
"body": ".notion-app-inner",
|
"body": ".notion-app-inner",
|
||||||
"code": ".notion-code-block *",
|
"code": ".notion-code-block *",
|
||||||
}
|
}
|
||||||
custom_fonts = self.get_page_config(url).get("fonts", {})
|
custom_fonts = self.get_page_config(url).get("fonts", {})
|
||||||
if (custom_fonts):
|
if custom_fonts:
|
||||||
# append a stylesheet importing the google font for each unique font
|
# append a stylesheet importing the google font for each unique font
|
||||||
unique_custom_fonts = set(custom_fonts.values())
|
unique_custom_fonts = set(custom_fonts.values())
|
||||||
for font in unique_custom_fonts:
|
for font in unique_custom_fonts:
|
||||||
if (font):
|
if font:
|
||||||
google_fonts_embed_name = font.replace(" ", "+")
|
google_fonts_embed_name = font.replace(" ", "+")
|
||||||
font_href = f"https://fonts.googleapis.com/css2?family={google_fonts_embed_name}:wght@500;600;700&display=swap"
|
font_href = f"https://fonts.googleapis.com/css2?family={google_fonts_embed_name}:wght@500;600;700&display=swap"
|
||||||
custom_font_stylesheet = soup.new_tag("link", rel="stylesheet", href=font_href)
|
custom_font_stylesheet = soup.new_tag(
|
||||||
soup.head.append(custom_font_stylesheet);
|
"link", rel="stylesheet", href=font_href
|
||||||
|
)
|
||||||
|
soup.head.append(custom_font_stylesheet)
|
||||||
|
|
||||||
# go through each custom font, and add a css rule overriding the font-family
|
# go through each custom font, and add a css rule overriding the font-family
|
||||||
# to the font override stylesheet targetting the appropriate selector
|
# to the font override stylesheet targetting the appropriate selector
|
||||||
font_override_stylesheet = soup.new_tag('style', type='text/css')
|
font_override_stylesheet = soup.new_tag("style", type="text/css")
|
||||||
for target, custom_font in custom_fonts.items():
|
for target, custom_font in custom_fonts.items():
|
||||||
if custom_font and not target == "site":
|
if custom_font and not target == "site":
|
||||||
log.debug(f"Setting {target} font-family to {custom_font}")
|
log.debug(f"Setting {target} font-family to {custom_font}")
|
||||||
font_override_stylesheet.append(fonts_selectors[target] + " {font-family:" + custom_font + " !important} ")
|
font_override_stylesheet.append(
|
||||||
|
fonts_selectors[target]
|
||||||
|
+ " {font-family:"
|
||||||
|
+ custom_font
|
||||||
|
+ " !important} "
|
||||||
|
)
|
||||||
site_font = custom_fonts.get("site", None)
|
site_font = custom_fonts.get("site", None)
|
||||||
# process global site font last to more granular settings can override it
|
# process global site font last to more granular settings can override it
|
||||||
if (site_font):
|
if site_font:
|
||||||
log.debug(f"Setting global site font-family to {site_font}"),
|
log.debug(f"Setting global site font-family to {site_font}"),
|
||||||
font_override_stylesheet.append(fonts_selectors["site"] + " {font-family:" + site_font + "} ")
|
font_override_stylesheet.append(
|
||||||
|
fonts_selectors["site"] + " {font-family:" + site_font + "} "
|
||||||
|
)
|
||||||
# finally append the font overrides stylesheets to the page
|
# finally append the font overrides stylesheets to the page
|
||||||
soup.head.append(font_override_stylesheet)
|
soup.head.append(font_override_stylesheet)
|
||||||
|
|
||||||
|
|
||||||
# inject any custom elements to the page
|
# inject any custom elements to the page
|
||||||
custom_injects = self.get_page_config(url).get("inject", {})
|
custom_injects = self.get_page_config(url).get("inject", {})
|
||||||
|
|
||||||
def injects_custom_tags(section):
|
def injects_custom_tags(section):
|
||||||
section_custom_injects = custom_injects.get(section, {})
|
section_custom_injects = custom_injects.get(section, {})
|
||||||
for tag, elements in section_custom_injects.items():
|
for tag, elements in section_custom_injects.items():
|
||||||
@ -418,74 +517,99 @@ class Parser():
|
|||||||
for attr, value in element.items():
|
for attr, value in element.items():
|
||||||
injected_tag[attr] = value
|
injected_tag[attr] = value
|
||||||
# if the value refers to a file, copy it to the dist folder
|
# if the value refers to a file, copy it to the dist folder
|
||||||
if (attr.lower() == "href" or attr.lower() == "src"):
|
if attr.lower() == "href" or attr.lower() == "src":
|
||||||
log.debug(f"Copying injected file '{value}'")
|
log.debug(f"Copying injected file '{value}'")
|
||||||
cached_custom_file = self.cache_file((Path.cwd() / value.strip("/")))
|
cached_custom_file = self.cache_file(
|
||||||
|
(Path.cwd() / value.strip("/"))
|
||||||
|
)
|
||||||
# destination = (self.dist_folder / source.name)
|
# destination = (self.dist_folder / source.name)
|
||||||
# shutil.copyfile(source, destination)
|
# shutil.copyfile(source, destination)
|
||||||
injected_tag[attr] = str(cached_custom_file) # source.name
|
injected_tag[attr] = str(cached_custom_file) # source.name
|
||||||
log.debug(f"Injecting <{section}> tag: {str(injected_tag)}")
|
log.debug(f"Injecting <{section}> tag: {str(injected_tag)}")
|
||||||
soup.find(section).append(injected_tag)
|
soup.find(section).append(injected_tag)
|
||||||
|
|
||||||
injects_custom_tags("head")
|
injects_custom_tags("head")
|
||||||
injects_custom_tags("body")
|
injects_custom_tags("body")
|
||||||
|
|
||||||
|
|
||||||
# inject loconotion's custom stylesheet and script
|
# inject loconotion's custom stylesheet and script
|
||||||
loconotion_custom_css = self.cache_file(Path("bundles/loconotion.css"))
|
loconotion_custom_css = self.cache_file(Path("bundles/loconotion.css"))
|
||||||
custom_css = soup.new_tag("link", rel="stylesheet", href=str(loconotion_custom_css))
|
custom_css = soup.new_tag(
|
||||||
|
"link", rel="stylesheet", href=str(loconotion_custom_css)
|
||||||
|
)
|
||||||
soup.head.insert(-1, custom_css)
|
soup.head.insert(-1, custom_css)
|
||||||
loconotion_custom_js = self.cache_file(Path("bundles/loconotion.js"))
|
loconotion_custom_js = self.cache_file(Path("bundles/loconotion.js"))
|
||||||
custom_script = soup.new_tag("script", type="text/javascript", src=str(loconotion_custom_js))
|
custom_script = soup.new_tag(
|
||||||
|
"script", type="text/javascript", src=str(loconotion_custom_js)
|
||||||
|
)
|
||||||
soup.body.insert(-1, custom_script)
|
soup.body.insert(-1, custom_script)
|
||||||
|
|
||||||
|
|
||||||
# find sub-pages and clean slugs / links
|
# find sub-pages and clean slugs / links
|
||||||
sub_pages = [];
|
sub_pages = []
|
||||||
for a in soup.findAll('a'):
|
for a in soup.findAll("a"):
|
||||||
if a['href'].startswith('/'):
|
if a["href"].startswith("/"):
|
||||||
sub_page_href = 'https://www.notion.so' + a['href']
|
sub_page_href = "https://www.notion.so" + a["href"]
|
||||||
# if the link is an anchor link, check if the page hasn't already been parsed
|
# if the link is an anchor link,
|
||||||
if ("#" in sub_page_href):
|
# check if the page hasn't already been parsed
|
||||||
|
if "#" in sub_page_href:
|
||||||
sub_page_href_tokens = sub_page_href.split("#")
|
sub_page_href_tokens = sub_page_href.split("#")
|
||||||
sub_page_href = sub_page_href_tokens[0]
|
sub_page_href = sub_page_href_tokens[0]
|
||||||
a['href'] = "#" + sub_page_href_tokens[-1]
|
a["href"] = "#" + sub_page_href_tokens[-1]
|
||||||
a['class'] = a.get('class', []) + ['loconotion-anchor-link']
|
a["class"] = a.get("class", []) + ["loconotion-anchor-link"]
|
||||||
if (sub_page_href in processed_pages.keys() or sub_page_href in sub_pages):
|
if (
|
||||||
log.debug(f"Original page for anchor link {sub_page_href} already parsed / pending parsing, skipping")
|
sub_page_href in processed_pages.keys()
|
||||||
|
or sub_page_href in sub_pages
|
||||||
|
):
|
||||||
|
log.debug(
|
||||||
|
f"Original page for anchor link {sub_page_href}"
|
||||||
|
" already parsed / pending parsing, skipping"
|
||||||
|
)
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
a['href'] = self.get_page_slug(sub_page_href) if sub_page_href != index else "index.html"
|
a["href"] = (
|
||||||
|
self.get_page_slug(sub_page_href)
|
||||||
|
if sub_page_href != index
|
||||||
|
else "index.html"
|
||||||
|
)
|
||||||
sub_pages.append(sub_page_href)
|
sub_pages.append(sub_page_href)
|
||||||
log.debug(f"Found link to page {a['href']}")
|
log.debug(f"Found link to page {a['href']}")
|
||||||
|
|
||||||
|
|
||||||
# exports the parsed page
|
# exports the parsed page
|
||||||
html_str = str(soup)
|
html_str = str(soup)
|
||||||
html_file = self.get_page_slug(url) if url != index else "index.html"
|
html_file = self.get_page_slug(url) if url != index else "index.html"
|
||||||
if (html_file in processed_pages.values()):
|
if html_file in processed_pages.values():
|
||||||
log.error(f"Found duplicate pages with slug '{html_file}' - previous one will be overwritten." +
|
log.error(
|
||||||
"make sure that your notion pages names or custom slugs in the configuration files are unique")
|
f"Found duplicate pages with slug '{html_file}' - previous one will be"
|
||||||
|
" overwritten. Make sure that your notion pages names or custom slugs"
|
||||||
|
" in the configuration files are unique"
|
||||||
|
)
|
||||||
log.info(f"Exporting page '{url}' as '{html_file}'")
|
log.info(f"Exporting page '{url}' as '{html_file}'")
|
||||||
with open(self.dist_folder / html_file, "wb") as f:
|
with open(self.dist_folder / html_file, "wb") as f:
|
||||||
f.write(html_str.encode('utf-8').strip())
|
f.write(html_str.encode("utf-8").strip())
|
||||||
processed_pages[url] = html_file
|
processed_pages[url] = html_file
|
||||||
|
|
||||||
|
|
||||||
# parse sub-pages
|
# parse sub-pages
|
||||||
if (sub_pages and not self.args.get("single_page", False)):
|
if sub_pages and not self.args.get("single_page", False):
|
||||||
if (processed_pages): log.debug(f"Pages processed so far: {len(processed_pages)}")
|
if processed_pages:
|
||||||
|
log.debug(f"Pages processed so far: {len(processed_pages)}")
|
||||||
for sub_page in sub_pages:
|
for sub_page in sub_pages:
|
||||||
if not sub_page in processed_pages.keys():
|
if not sub_page in processed_pages.keys():
|
||||||
self.parse_page(sub_page, processed_pages = processed_pages, index = index)
|
self.parse_page(
|
||||||
|
sub_page, processed_pages=processed_pages, index=index
|
||||||
|
)
|
||||||
|
|
||||||
# we're all done!
|
# we're all done!
|
||||||
return processed_pages
|
return processed_pages
|
||||||
|
|
||||||
|
|
||||||
def run(self, url):
|
def run(self, url):
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
total_processed_pages = self.parse_page(url)
|
tot_processed_pages = self.parse_page(url)
|
||||||
elapsed_time = time.time() - start_time
|
elapsed_time = time.time() - start_time
|
||||||
formatted_time = '{:02d}:{:02d}:{:02d}'.format(int(elapsed_time // 3600), int(elapsed_time % 3600 // 60), int(elapsed_time % 60))
|
formatted_time = "{:02d}:{:02d}:{:02d}".format(
|
||||||
log.info(f'Finished!\n\nProcessed {len(total_processed_pages)} pages in {formatted_time}')
|
int(elapsed_time // 3600),
|
||||||
|
int(elapsed_time % 3600 // 60),
|
||||||
|
int(elapsed_time % 60),
|
||||||
|
tot_processed_pages,
|
||||||
|
)
|
||||||
|
log.info(
|
||||||
|
f"Finished!\n\nProcessed {len(tot_processed_pages)} pages in {formatted_time}"
|
||||||
|
)
|
||||||
|
315
poetry.lock
generated
Normal file
315
poetry.lock
generated
Normal file
@ -0,0 +1,315 @@
|
|||||||
|
[[package]]
|
||||||
|
category = "dev"
|
||||||
|
description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
|
||||||
|
name = "appdirs"
|
||||||
|
optional = false
|
||||||
|
python-versions = "*"
|
||||||
|
version = "1.4.4"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
category = "dev"
|
||||||
|
description = "Classes Without Boilerplate"
|
||||||
|
name = "attrs"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
|
||||||
|
version = "19.3.0"
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
azure-pipelines = ["coverage", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface", "pytest-azurepipelines"]
|
||||||
|
dev = ["coverage", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface", "sphinx", "pre-commit"]
|
||||||
|
docs = ["sphinx", "zope.interface"]
|
||||||
|
tests = ["coverage", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
category = "main"
|
||||||
|
description = "Screen-scraping library"
|
||||||
|
name = "beautifulsoup4"
|
||||||
|
optional = false
|
||||||
|
python-versions = "*"
|
||||||
|
version = "4.9.1"
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
soupsieve = [">1.2", "<2.0"]
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
html5lib = ["html5lib"]
|
||||||
|
lxml = ["lxml"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
category = "dev"
|
||||||
|
description = "The uncompromising code formatter."
|
||||||
|
name = "black"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.6"
|
||||||
|
version = "19.10b0"
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
appdirs = "*"
|
||||||
|
attrs = ">=18.1.0"
|
||||||
|
click = ">=6.5"
|
||||||
|
pathspec = ">=0.6,<1"
|
||||||
|
regex = "*"
|
||||||
|
toml = ">=0.9.4"
|
||||||
|
typed-ast = ">=1.4.0"
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
d = ["aiohttp (>=3.3.2)", "aiohttp-cors"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
category = "main"
|
||||||
|
description = "Python package for providing Mozilla's CA Bundle."
|
||||||
|
name = "certifi"
|
||||||
|
optional = false
|
||||||
|
python-versions = "*"
|
||||||
|
version = "2020.4.5.1"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
category = "main"
|
||||||
|
description = "Universal encoding detector for Python 2 and 3"
|
||||||
|
name = "chardet"
|
||||||
|
optional = false
|
||||||
|
python-versions = "*"
|
||||||
|
version = "3.0.4"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
category = "main"
|
||||||
|
description = "Automatically install chromedriver that supports the currently installed version of chrome."
|
||||||
|
name = "chromedriver-autoinstaller"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3"
|
||||||
|
version = "0.2.0"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
category = "dev"
|
||||||
|
description = "Composable command line interface toolkit"
|
||||||
|
name = "click"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
|
||||||
|
version = "7.1.2"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
category = "main"
|
||||||
|
description = "Cross-platform colored terminal text."
|
||||||
|
name = "colorama"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
|
||||||
|
version = "0.4.3"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
category = "main"
|
||||||
|
description = "A CSS Cascading Style Sheets library for Python"
|
||||||
|
name = "cssutils"
|
||||||
|
optional = false
|
||||||
|
python-versions = "*"
|
||||||
|
version = "1.0.2"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
category = "main"
|
||||||
|
description = "Internationalized Domain Names in Applications (IDNA)"
|
||||||
|
name = "idna"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
|
||||||
|
version = "2.9"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
category = "dev"
|
||||||
|
description = "Utility library for gitignore style pattern matching of file paths."
|
||||||
|
name = "pathspec"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
|
||||||
|
version = "0.8.0"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
category = "dev"
|
||||||
|
description = "Alternative regular expression module, to replace re."
|
||||||
|
name = "regex"
|
||||||
|
optional = false
|
||||||
|
python-versions = "*"
|
||||||
|
version = "2020.5.14"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
category = "main"
|
||||||
|
description = "Python HTTP for Humans."
|
||||||
|
name = "requests"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
|
||||||
|
version = "2.23.0"
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
certifi = ">=2017.4.17"
|
||||||
|
chardet = ">=3.0.2,<4"
|
||||||
|
idna = ">=2.5,<3"
|
||||||
|
urllib3 = ">=1.21.1,<1.25.0 || >1.25.0,<1.25.1 || >1.25.1,<1.26"
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
security = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)"]
|
||||||
|
socks = ["PySocks (>=1.5.6,<1.5.7 || >1.5.7)", "win-inet-pton"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
category = "main"
|
||||||
|
description = "Python bindings for Selenium"
|
||||||
|
name = "selenium"
|
||||||
|
optional = false
|
||||||
|
python-versions = "*"
|
||||||
|
version = "3.141.0"
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
urllib3 = "*"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
category = "main"
|
||||||
|
description = "A modern CSS selector implementation for Beautiful Soup."
|
||||||
|
name = "soupsieve"
|
||||||
|
optional = false
|
||||||
|
python-versions = "*"
|
||||||
|
version = "1.9.6"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
category = "main"
|
||||||
|
description = "Python Library for Tom's Obvious, Minimal Language"
|
||||||
|
name = "toml"
|
||||||
|
optional = false
|
||||||
|
python-versions = "*"
|
||||||
|
version = "0.10.1"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
category = "dev"
|
||||||
|
description = "a fork of Python 2 and 3 ast modules with type comment support"
|
||||||
|
name = "typed-ast"
|
||||||
|
optional = false
|
||||||
|
python-versions = "*"
|
||||||
|
version = "1.4.1"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
category = "main"
|
||||||
|
description = "HTTP library with thread-safe connection pooling, file post, and more."
|
||||||
|
name = "urllib3"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4"
|
||||||
|
version = "1.25.9"
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
brotli = ["brotlipy (>=0.6.0)"]
|
||||||
|
secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "pyOpenSSL (>=0.14)", "ipaddress"]
|
||||||
|
socks = ["PySocks (>=1.5.6,<1.5.7 || >1.5.7,<2.0)"]
|
||||||
|
|
||||||
|
[metadata]
|
||||||
|
content-hash = "ef223e0d435f4ab7f38a6499586aecdb96924ccb7bd59cd0982d0496479ad60f"
|
||||||
|
python-versions = "^3.7"
|
||||||
|
|
||||||
|
[metadata.files]
|
||||||
|
appdirs = [
|
||||||
|
{file = "appdirs-1.4.4-py2.py3-none-any.whl", hash = "sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128"},
|
||||||
|
{file = "appdirs-1.4.4.tar.gz", hash = "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41"},
|
||||||
|
]
|
||||||
|
attrs = [
|
||||||
|
{file = "attrs-19.3.0-py2.py3-none-any.whl", hash = "sha256:08a96c641c3a74e44eb59afb61a24f2cb9f4d7188748e76ba4bb5edfa3cb7d1c"},
|
||||||
|
{file = "attrs-19.3.0.tar.gz", hash = "sha256:f7b7ce16570fe9965acd6d30101a28f62fb4a7f9e926b3bbc9b61f8b04247e72"},
|
||||||
|
]
|
||||||
|
beautifulsoup4 = [
|
||||||
|
{file = "beautifulsoup4-4.9.1-py2-none-any.whl", hash = "sha256:e718f2342e2e099b640a34ab782407b7b676f47ee272d6739e60b8ea23829f2c"},
|
||||||
|
{file = "beautifulsoup4-4.9.1-py3-none-any.whl", hash = "sha256:a6237df3c32ccfaee4fd201c8f5f9d9df619b93121d01353a64a73ce8c6ef9a8"},
|
||||||
|
{file = "beautifulsoup4-4.9.1.tar.gz", hash = "sha256:73cc4d115b96f79c7d77c1c7f7a0a8d4c57860d1041df407dd1aae7f07a77fd7"},
|
||||||
|
]
|
||||||
|
black = [
|
||||||
|
{file = "black-19.10b0-py36-none-any.whl", hash = "sha256:1b30e59be925fafc1ee4565e5e08abef6b03fe455102883820fe5ee2e4734e0b"},
|
||||||
|
{file = "black-19.10b0.tar.gz", hash = "sha256:c2edb73a08e9e0e6f65a0e6af18b059b8b1cdd5bef997d7a0b181df93dc81539"},
|
||||||
|
]
|
||||||
|
certifi = [
|
||||||
|
{file = "certifi-2020.4.5.1-py2.py3-none-any.whl", hash = "sha256:1d987a998c75633c40847cc966fcf5904906c920a7f17ef374f5aa4282abd304"},
|
||||||
|
{file = "certifi-2020.4.5.1.tar.gz", hash = "sha256:51fcb31174be6e6664c5f69e3e1691a2d72a1a12e90f872cbdb1567eb47b6519"},
|
||||||
|
]
|
||||||
|
chardet = [
|
||||||
|
{file = "chardet-3.0.4-py2.py3-none-any.whl", hash = "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"},
|
||||||
|
{file = "chardet-3.0.4.tar.gz", hash = "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae"},
|
||||||
|
]
|
||||||
|
chromedriver-autoinstaller = [
|
||||||
|
{file = "chromedriver-autoinstaller-0.2.0.tar.gz", hash = "sha256:e6aadc277f2c3a1d247541eecb60bfdeabb3250c56ad9998595420840d1c7f71"},
|
||||||
|
{file = "chromedriver_autoinstaller-0.2.0-py3-none-any.whl", hash = "sha256:290a72a1e60e5d806ac0d7cc14bd6aa0746bf8e007899efca48b25eb239ea851"},
|
||||||
|
]
|
||||||
|
click = [
|
||||||
|
{file = "click-7.1.2-py2.py3-none-any.whl", hash = "sha256:dacca89f4bfadd5de3d7489b7c8a566eee0d3676333fbb50030263894c38c0dc"},
|
||||||
|
{file = "click-7.1.2.tar.gz", hash = "sha256:d2b5255c7c6349bc1bd1e59e08cd12acbbd63ce649f2588755783aa94dfb6b1a"},
|
||||||
|
]
|
||||||
|
colorama = [
|
||||||
|
{file = "colorama-0.4.3-py2.py3-none-any.whl", hash = "sha256:7d73d2a99753107a36ac6b455ee49046802e59d9d076ef8e47b61499fa29afff"},
|
||||||
|
{file = "colorama-0.4.3.tar.gz", hash = "sha256:e96da0d330793e2cb9485e9ddfd918d456036c7149416295932478192f4436a1"},
|
||||||
|
]
|
||||||
|
cssutils = [
|
||||||
|
{file = "cssutils-1.0.2-py3-none-any.whl", hash = "sha256:c74dbe19c92f5052774eadb15136263548dd013250f1ed1027988e7fef125c8d"},
|
||||||
|
{file = "cssutils-1.0.2.tar.gz", hash = "sha256:a2fcf06467553038e98fea9cfe36af2bf14063eb147a70958cfcaa8f5786acaf"},
|
||||||
|
]
|
||||||
|
idna = [
|
||||||
|
{file = "idna-2.9-py2.py3-none-any.whl", hash = "sha256:a068a21ceac8a4d63dbfd964670474107f541babbd2250d61922f029858365fa"},
|
||||||
|
{file = "idna-2.9.tar.gz", hash = "sha256:7588d1c14ae4c77d74036e8c22ff447b26d0fde8f007354fd48a7814db15b7cb"},
|
||||||
|
]
|
||||||
|
pathspec = [
|
||||||
|
{file = "pathspec-0.8.0-py2.py3-none-any.whl", hash = "sha256:7d91249d21749788d07a2d0f94147accd8f845507400749ea19c1ec9054a12b0"},
|
||||||
|
{file = "pathspec-0.8.0.tar.gz", hash = "sha256:da45173eb3a6f2a5a487efba21f050af2b41948be6ab52b6a1e3ff22bb8b7061"},
|
||||||
|
]
|
||||||
|
regex = [
|
||||||
|
{file = "regex-2020.5.14-cp27-cp27m-win32.whl", hash = "sha256:e565569fc28e3ba3e475ec344d87ed3cd8ba2d575335359749298a0899fe122e"},
|
||||||
|
{file = "regex-2020.5.14-cp27-cp27m-win_amd64.whl", hash = "sha256:d466967ac8e45244b9dfe302bbe5e3337f8dc4dec8d7d10f5e950d83b140d33a"},
|
||||||
|
{file = "regex-2020.5.14-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:27ff7325b297fb6e5ebb70d10437592433601c423f5acf86e5bc1ee2919b9561"},
|
||||||
|
{file = "regex-2020.5.14-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:ea55b80eb0d1c3f1d8d784264a6764f931e172480a2f1868f2536444c5f01e01"},
|
||||||
|
{file = "regex-2020.5.14-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:c9bce6e006fbe771a02bda468ec40ffccbf954803b470a0345ad39c603402577"},
|
||||||
|
{file = "regex-2020.5.14-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:d881c2e657c51d89f02ae4c21d9adbef76b8325fe4d5cf0e9ad62f850f3a98fd"},
|
||||||
|
{file = "regex-2020.5.14-cp36-cp36m-win32.whl", hash = "sha256:99568f00f7bf820c620f01721485cad230f3fb28f57d8fbf4a7967ec2e446994"},
|
||||||
|
{file = "regex-2020.5.14-cp36-cp36m-win_amd64.whl", hash = "sha256:70c14743320a68c5dac7fc5a0f685be63bc2024b062fe2aaccc4acc3d01b14a1"},
|
||||||
|
{file = "regex-2020.5.14-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:a7c37f048ec3920783abab99f8f4036561a174f1314302ccfa4e9ad31cb00eb4"},
|
||||||
|
{file = "regex-2020.5.14-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:89d76ce33d3266173f5be80bd4efcbd5196cafc34100fdab814f9b228dee0fa4"},
|
||||||
|
{file = "regex-2020.5.14-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:51f17abbe973c7673a61863516bdc9c0ef467407a940f39501e786a07406699c"},
|
||||||
|
{file = "regex-2020.5.14-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:ce5cc53aa9fbbf6712e92c7cf268274eaff30f6bd12a0754e8133d85a8fb0f5f"},
|
||||||
|
{file = "regex-2020.5.14-cp37-cp37m-win32.whl", hash = "sha256:8044d1c085d49673aadb3d7dc20ef5cb5b030c7a4fa253a593dda2eab3059929"},
|
||||||
|
{file = "regex-2020.5.14-cp37-cp37m-win_amd64.whl", hash = "sha256:c2062c7d470751b648f1cacc3f54460aebfc261285f14bc6da49c6943bd48bdd"},
|
||||||
|
{file = "regex-2020.5.14-cp38-cp38-manylinux1_i686.whl", hash = "sha256:329ba35d711e3428db6b45a53b1b13a0a8ba07cbbcf10bbed291a7da45f106c3"},
|
||||||
|
{file = "regex-2020.5.14-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:579ea215c81d18da550b62ff97ee187b99f1b135fd894a13451e00986a080cad"},
|
||||||
|
{file = "regex-2020.5.14-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:3a9394197664e35566242686d84dfd264c07b20f93514e2e09d3c2b3ffdf78fe"},
|
||||||
|
{file = "regex-2020.5.14-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:ce367d21f33e23a84fb83a641b3834dd7dd8e9318ad8ff677fbfae5915a239f7"},
|
||||||
|
{file = "regex-2020.5.14-cp38-cp38-win32.whl", hash = "sha256:1386e75c9d1574f6aa2e4eb5355374c8e55f9aac97e224a8a5a6abded0f9c927"},
|
||||||
|
{file = "regex-2020.5.14-cp38-cp38-win_amd64.whl", hash = "sha256:7e61be8a2900897803c293247ef87366d5df86bf701083b6c43119c7c6c99108"},
|
||||||
|
{file = "regex-2020.5.14.tar.gz", hash = "sha256:ce450ffbfec93821ab1fea94779a8440e10cf63819be6e176eb1973a6017aff5"},
|
||||||
|
]
|
||||||
|
requests = [
|
||||||
|
{file = "requests-2.23.0-py2.py3-none-any.whl", hash = "sha256:43999036bfa82904b6af1d99e4882b560e5e2c68e5c4b0aa03b655f3d7d73fee"},
|
||||||
|
{file = "requests-2.23.0.tar.gz", hash = "sha256:b3f43d496c6daba4493e7c431722aeb7dbc6288f52a6e04e7b6023b0247817e6"},
|
||||||
|
]
|
||||||
|
selenium = [
|
||||||
|
{file = "selenium-3.141.0-py2.py3-none-any.whl", hash = "sha256:2d7131d7bc5a5b99a2d9b04aaf2612c411b03b8ca1b1ee8d3de5845a9be2cb3c"},
|
||||||
|
{file = "selenium-3.141.0.tar.gz", hash = "sha256:deaf32b60ad91a4611b98d8002757f29e6f2c2d5fcaf202e1c9ad06d6772300d"},
|
||||||
|
]
|
||||||
|
soupsieve = [
|
||||||
|
{file = "soupsieve-1.9.6-py2.py3-none-any.whl", hash = "sha256:feb1e937fa26a69e08436aad4a9037cd7e1d4c7212909502ba30701247ff8abd"},
|
||||||
|
{file = "soupsieve-1.9.6.tar.gz", hash = "sha256:7985bacc98c34923a439967c1a602dc4f1e15f923b6fcf02344184f86cc7efaa"},
|
||||||
|
]
|
||||||
|
toml = [
|
||||||
|
{file = "toml-0.10.1-py2.py3-none-any.whl", hash = "sha256:bda89d5935c2eac546d648028b9901107a595863cb36bae0c73ac804a9b4ce88"},
|
||||||
|
{file = "toml-0.10.1.tar.gz", hash = "sha256:926b612be1e5ce0634a2ca03470f95169cf16f939018233a670519cb4ac58b0f"},
|
||||||
|
]
|
||||||
|
typed-ast = [
|
||||||
|
{file = "typed_ast-1.4.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:73d785a950fc82dd2a25897d525d003f6378d1cb23ab305578394694202a58c3"},
|
||||||
|
{file = "typed_ast-1.4.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:aaee9905aee35ba5905cfb3c62f3e83b3bec7b39413f0a7f19be4e547ea01ebb"},
|
||||||
|
{file = "typed_ast-1.4.1-cp35-cp35m-win32.whl", hash = "sha256:0c2c07682d61a629b68433afb159376e24e5b2fd4641d35424e462169c0a7919"},
|
||||||
|
{file = "typed_ast-1.4.1-cp35-cp35m-win_amd64.whl", hash = "sha256:4083861b0aa07990b619bd7ddc365eb7fa4b817e99cf5f8d9cf21a42780f6e01"},
|
||||||
|
{file = "typed_ast-1.4.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:269151951236b0f9a6f04015a9004084a5ab0d5f19b57de779f908621e7d8b75"},
|
||||||
|
{file = "typed_ast-1.4.1-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:24995c843eb0ad11a4527b026b4dde3da70e1f2d8806c99b7b4a7cf491612652"},
|
||||||
|
{file = "typed_ast-1.4.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:fe460b922ec15dd205595c9b5b99e2f056fd98ae8f9f56b888e7a17dc2b757e7"},
|
||||||
|
{file = "typed_ast-1.4.1-cp36-cp36m-win32.whl", hash = "sha256:4e3e5da80ccbebfff202a67bf900d081906c358ccc3d5e3c8aea42fdfdfd51c1"},
|
||||||
|
{file = "typed_ast-1.4.1-cp36-cp36m-win_amd64.whl", hash = "sha256:249862707802d40f7f29f6e1aad8d84b5aa9e44552d2cc17384b209f091276aa"},
|
||||||
|
{file = "typed_ast-1.4.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8ce678dbaf790dbdb3eba24056d5364fb45944f33553dd5869b7580cdbb83614"},
|
||||||
|
{file = "typed_ast-1.4.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:c9e348e02e4d2b4a8b2eedb48210430658df6951fa484e59de33ff773fbd4b41"},
|
||||||
|
{file = "typed_ast-1.4.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:bcd3b13b56ea479b3650b82cabd6b5343a625b0ced5429e4ccad28a8973f301b"},
|
||||||
|
{file = "typed_ast-1.4.1-cp37-cp37m-win32.whl", hash = "sha256:d5d33e9e7af3b34a40dc05f498939f0ebf187f07c385fd58d591c533ad8562fe"},
|
||||||
|
{file = "typed_ast-1.4.1-cp37-cp37m-win_amd64.whl", hash = "sha256:0666aa36131496aed8f7be0410ff974562ab7eeac11ef351def9ea6fa28f6355"},
|
||||||
|
{file = "typed_ast-1.4.1-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:d205b1b46085271b4e15f670058ce182bd1199e56b317bf2ec004b6a44f911f6"},
|
||||||
|
{file = "typed_ast-1.4.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:6daac9731f172c2a22ade6ed0c00197ee7cc1221aa84cfdf9c31defeb059a907"},
|
||||||
|
{file = "typed_ast-1.4.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:498b0f36cc7054c1fead3d7fc59d2150f4d5c6c56ba7fb150c013fbc683a8d2d"},
|
||||||
|
{file = "typed_ast-1.4.1-cp38-cp38-win32.whl", hash = "sha256:715ff2f2df46121071622063fc7543d9b1fd19ebfc4f5c8895af64a77a8c852c"},
|
||||||
|
{file = "typed_ast-1.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:fc0fea399acb12edbf8a628ba8d2312f583bdbdb3335635db062fa98cf71fca4"},
|
||||||
|
{file = "typed_ast-1.4.1-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:d43943ef777f9a1c42bf4e552ba23ac77a6351de620aa9acf64ad54933ad4d34"},
|
||||||
|
{file = "typed_ast-1.4.1.tar.gz", hash = "sha256:8c8aaad94455178e3187ab22c8b01a3837f8ee50e09cf31f1ba129eb293ec30b"},
|
||||||
|
]
|
||||||
|
urllib3 = [
|
||||||
|
{file = "urllib3-1.25.9-py2.py3-none-any.whl", hash = "sha256:88206b0eb87e6d677d424843ac5209e3fb9d0190d0ee169599165ec25e9d9115"},
|
||||||
|
{file = "urllib3-1.25.9.tar.gz", hash = "sha256:3018294ebefce6572a474f0604c2021e33b3fd8006ecd11d62107a5d2a963527"},
|
||||||
|
]
|
22
pyproject.toml
Normal file
22
pyproject.toml
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
[tool.poetry]
|
||||||
|
name = "loconotion"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = ""
|
||||||
|
authors = ["Leonardo Cavaletti <impeto.blu@gmail.com>"]
|
||||||
|
|
||||||
|
[tool.poetry.dependencies]
|
||||||
|
python = "^3.7"
|
||||||
|
beautifulsoup4 = "^4.9.1"
|
||||||
|
chromedriver-autoinstaller = "^0.2.0"
|
||||||
|
colorama = "^0.4.3"
|
||||||
|
cssutils = "^1.0.2"
|
||||||
|
requests = "^2.23.0"
|
||||||
|
selenium = "^3.141.0"
|
||||||
|
toml = "^0.10.1"
|
||||||
|
|
||||||
|
[tool.poetry.dev-dependencies]
|
||||||
|
black = "^19.10b0"
|
||||||
|
|
||||||
|
[build-system]
|
||||||
|
requires = ["poetry>=0.12"]
|
||||||
|
build-backend = "poetry.masonry.api"
|
@ -1,12 +1,37 @@
|
|||||||
beautifulsoup4==4.9.1
|
beautifulsoup4==4.9.1 \
|
||||||
certifi==2020.4.5.1
|
--hash=sha256:e718f2342e2e099b640a34ab782407b7b676f47ee272d6739e60b8ea23829f2c \
|
||||||
chardet==3.0.4
|
--hash=sha256:a6237df3c32ccfaee4fd201c8f5f9d9df619b93121d01353a64a73ce8c6ef9a8 \
|
||||||
chromedriver-autoinstaller==0.2.0
|
--hash=sha256:73cc4d115b96f79c7d77c1c7f7a0a8d4c57860d1041df407dd1aae7f07a77fd7
|
||||||
colorama==0.4.3
|
certifi==2020.4.5.1 \
|
||||||
cssutils==1.0.2
|
--hash=sha256:1d987a998c75633c40847cc966fcf5904906c920a7f17ef374f5aa4282abd304 \
|
||||||
idna==2.9
|
--hash=sha256:51fcb31174be6e6664c5f69e3e1691a2d72a1a12e90f872cbdb1567eb47b6519
|
||||||
requests==2.23.0
|
chardet==3.0.4 \
|
||||||
selenium==3.141.0
|
--hash=sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691 \
|
||||||
soupsieve==2.0.1
|
--hash=sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae
|
||||||
toml==0.10.1
|
chromedriver-autoinstaller==0.2.0 \
|
||||||
urllib3==1.25.9
|
--hash=sha256:e6aadc277f2c3a1d247541eecb60bfdeabb3250c56ad9998595420840d1c7f71 \
|
||||||
|
--hash=sha256:290a72a1e60e5d806ac0d7cc14bd6aa0746bf8e007899efca48b25eb239ea851
|
||||||
|
colorama==0.4.3 \
|
||||||
|
--hash=sha256:7d73d2a99753107a36ac6b455ee49046802e59d9d076ef8e47b61499fa29afff \
|
||||||
|
--hash=sha256:e96da0d330793e2cb9485e9ddfd918d456036c7149416295932478192f4436a1
|
||||||
|
cssutils==1.0.2 \
|
||||||
|
--hash=sha256:c74dbe19c92f5052774eadb15136263548dd013250f1ed1027988e7fef125c8d \
|
||||||
|
--hash=sha256:a2fcf06467553038e98fea9cfe36af2bf14063eb147a70958cfcaa8f5786acaf
|
||||||
|
idna==2.9 \
|
||||||
|
--hash=sha256:a068a21ceac8a4d63dbfd964670474107f541babbd2250d61922f029858365fa \
|
||||||
|
--hash=sha256:7588d1c14ae4c77d74036e8c22ff447b26d0fde8f007354fd48a7814db15b7cb
|
||||||
|
requests==2.23.0 \
|
||||||
|
--hash=sha256:43999036bfa82904b6af1d99e4882b560e5e2c68e5c4b0aa03b655f3d7d73fee \
|
||||||
|
--hash=sha256:b3f43d496c6daba4493e7c431722aeb7dbc6288f52a6e04e7b6023b0247817e6
|
||||||
|
selenium==3.141.0 \
|
||||||
|
--hash=sha256:2d7131d7bc5a5b99a2d9b04aaf2612c411b03b8ca1b1ee8d3de5845a9be2cb3c \
|
||||||
|
--hash=sha256:deaf32b60ad91a4611b98d8002757f29e6f2c2d5fcaf202e1c9ad06d6772300d
|
||||||
|
soupsieve==1.9.6 \
|
||||||
|
--hash=sha256:feb1e937fa26a69e08436aad4a9037cd7e1d4c7212909502ba30701247ff8abd \
|
||||||
|
--hash=sha256:7985bacc98c34923a439967c1a602dc4f1e15f923b6fcf02344184f86cc7efaa
|
||||||
|
toml==0.10.1 \
|
||||||
|
--hash=sha256:bda89d5935c2eac546d648028b9901107a595863cb36bae0c73ac804a9b4ce88 \
|
||||||
|
--hash=sha256:926b612be1e5ce0634a2ca03470f95169cf16f939018233a670519cb4ac58b0f
|
||||||
|
urllib3==1.25.9 \
|
||||||
|
--hash=sha256:88206b0eb87e6d677d424843ac5209e3fb9d0190d0ee169599165ec25e9d9115 \
|
||||||
|
--hash=sha256:3018294ebefce6572a474f0604c2021e33b3fd8006ecd11d62107a5d2a963527
|
||||||
|
Loading…
Reference in New Issue
Block a user