Fixed requirements.txt and image URL unescaping for default Notion images

This commit is contained in:
Leonardo Cavaletti 2020-05-19 20:42:37 +01:00
parent c124806bdb
commit 3c111236ba
2 changed files with 12 additions and 20 deletions

View File

@@ -80,7 +80,7 @@ class Parser():
# set the output folder based on the site name
self.dist_folder = Path(config.get("output", Path("dist") / site_name))
log.info(f"Setting output path to {self.dist_folder}")
log.info(f"Setting output path to '{self.dist_folder}'")
# check if the argument to clean the dist folder was passed
if (self.args.get("clean", False)):
@@ -130,7 +130,7 @@ class Parser():
# first check if the url has a custom slug configured in the config file
custom_slug = self.get_page_config(url).get("slug", None)
if custom_slug:
log.debug(f"Custom slug found for url {url}: {custom_slug}")
log.debug(f"Custom slug found for url '{url}': '{custom_slug}'")
return custom_slug.strip("/") + (".html" if extension else "")
else:
# if not, clean up the existing slug
@@ -220,8 +220,8 @@ class Parser():
if (not index):
index = url;
log.info(f'Parsing page {url}')
log.debug(f'Using page config: {self.get_page_config(url)}')
log.info(f"Parsing page '{url}'")
log.debug(f"Using page config: {self.get_page_config(url)}")
self.driver.get(url)
# if ("This content does not exist" in self.driver.page_source):
@@ -256,7 +256,7 @@ class Parser():
try:
WebDriverWait(self.driver, 10).until(toggle_block_has_opened(toggle_block))
except TimeoutException as ex:
log.warning("Timeout waiting for toggle block to open. Likely it's already open, but doesn't hurt to check.")
log.warning("Timeout waiting for toggle block to open. Likely it's already open, but doesn't hurt to check.")
except Exception as ex:
log.error("Something went wrong with selenium while trying to open a toggle block")
opened_toggles.append(toggle_block)
@@ -311,9 +311,10 @@ class Parser():
# if the path starts with /, it's one of notion's predefined images
if (img['src'].startswith('/')):
# notion's images urls are in a weird format, need to sanitize them
# notion's own default images urls are in a weird format, need to sanitize them
img_src = 'https://www.notion.so' + img['src'].split("notion.so")[-1].replace("notion.so", "").split("?")[0]
# img_src = urllib.parse.unquote(img_src)
if (not '.amazonaws' in img_src):
img_src = urllib.parse.unquote(img_src)
cached_image = self.cache_file(img_src)
img['src'] = cached_image
@@ -433,7 +434,7 @@ class Parser():
if (html_file in processed_pages.values()):
log.error(f"Found duplicate pages with slug '{html_file}' - previous one will be overwritten." +
"make sure that your notion pages names or custom slugs in the configuration files are unique")
log.info(f"Exporting page {url} as {html_file}")
log.info(f"Exporting page '{url}' as '{html_file}'")
with open(self.dist_folder / html_file, "wb") as f:
f.write(html_str.encode('utf-8').strip())
processed_pages[url] = html_file
@@ -453,7 +454,7 @@ class Parser():
total_processed_pages = self.parse_page(url)
elapsed_time = time.time() - start_time
formatted_time = '{:02d}:{:02d}:{:02d}'.format(int(elapsed_time // 3600), int(elapsed_time % 3600 // 60), int(elapsed_time % 60))
log.info(f'Finished!\nヽ( ・‿・)ノ Processed {len(total_processed_pages)} pages in {formatted_time}')
log.info(f'Finished!\n\n\tヽ( ・‿・)ノ Processed {len(total_processed_pages)} pages in {formatted_time}')
if __name__ == '__main__':
# set up argument parser

View File

@@ -1,20 +1,11 @@
ansicon==1.89.0
beautifulsoup4==4.9.0
blessed==1.17.5
beautifulsoup4==4.9.1
certifi==2020.4.5.1
chardet==3.0.4
colorama==0.4.3
commonmark==0.9.1
cssutils==1.0.2
idna==2.9
jinxed==1.0.0
pprintpp==0.4.0
Pygments==2.6.1
requests==2.23.0
selenium==3.141.0
six==1.14.0
soupsieve==2.0
soupsieve==2.0.1
toml==0.10.1
typing-extensions==3.7.4.2
urllib3==1.25.9
wcwidth==0.1.9