Improved files downloading / caching

Leonardo Cavaletti 2020-05-18 19:40:16 +01:00
parent 0115483f04
commit 71c44a8750
3 changed files with 133 additions and 88 deletions
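In outline, the reworked cache_file now accepts either a remote URL or a local path, stores the file in the dist folder under a sha1-derived name, and returns a dist-relative path (or the original url as a last resort). A hypothetical call site, with the notion_parser name assumed, just to show the intended contract:

    # hypothetical usage (not part of this commit); assumes notion_parser is a
    # Parser instance already pointing at a dist folder
    cached = notion_parser.cache_file("https://www.notion.so/images/favicon.ico")
    # the first call downloads the file and returns a dist-relative path such as
    # Path("<sha1-of-url>.ico"); calling again with the same url matches the
    # glob lookup and reuses the cached copy instead of re-downloading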

View File

@@ -57,4 +57,9 @@ div[role="button"]:not(.notion-record-icon):hover {
 .notion-column_list-block > div > * {
   width: unset !important;
 }
+
+/* Stops text from overflowing past max-width on bookmark blocks; */
+.notion-bookmark-block div {
+  white-space: unset !important;
+}
 }

View File

@@ -26,34 +26,39 @@ for (let i = 0; i < toggleButtons.length; i++) {
   }
 }
-const pendingIframes = document.getElementsByClassName("loconotion-iframe-target");
+const pendingIframes = document.getElementsByTagName("iframe");
 for (let i = 0; i < pendingIframes.length; i++) {
-  const pendingIframe = pendingIframes.item(i);
-  const iframeSrc = pendingIframe.getAttribute("loconotion-iframe-src");
-  const iframe = document.createElement("iframe");
-  pendingIframe.style.opacity = 0;
-  iframe.onload = () => {
-    pendingIframe.style.opacity = 1;
-  };
-  iframe.style.width = "100%";
-  iframe.style.height = "100%";
-  iframe.style.position = "absolute";
-  iframe.style.left = 0;
-  iframe.style.top = 0;
-  iframe.style.pointerEvents = "auto";
-  iframe.setAttribute("src", iframeSrc);
-  iframe.setAttribute("frameborder", "0");
-  iframe.setAttribute(
-    "sandbox",
-    "allow-scripts allow-popups allow-top-navigation-by-user-activation allow-forms allow-same-origin"
-  );
-  pendingIframe.appendChild(iframe);
+  pendingIframes.item(i).parentElement.style.opacity = 1;
 }
+// const pendingIframes = document.getElementsByClassName("loconotion-iframe-target");
+// for (let i = 0; i < pendingIframes.length; i++) {
+//   const pendingIframe = pendingIframes.item(i);
+//   const iframeSrc = pendingIframe.getAttribute("loconotion-iframe-src");
+//   const iframe = document.createElement("iframe");
+//   pendingIframe.style.opacity = 0;
+//   iframe.onload = () => {
+//     pendingIframe.style.opacity = 1;
+//   };
+//   iframe.style.width = "100%";
+//   iframe.style.height = "100%";
+//   iframe.style.position = "absolute";
+//   iframe.style.left = 0;
+//   iframe.style.top = 0;
+//   iframe.style.pointerEvents = "auto";
+//   iframe.setAttribute("src", iframeSrc);
+//   iframe.setAttribute("frameborder", "0");
+//   iframe.setAttribute(
+//     "sandbox",
+//     "allow-scripts allow-popups allow-top-navigation-by-user-activation allow-forms allow-same-origin"
+//   );
+//   pendingIframe.appendChild(iframe);
+// }
 const collectionSearchBoxes = document.getElementsByClassName("collectionSearch");
 for (let i = 0; i < collectionSearchBoxes.length; i++) {
   const collectionSearchBox = collectionSearchBoxes.item(i).parentElement;

View File

@@ -6,6 +6,8 @@ import time
 import uuid
 import logging
 import re
+import glob
+import mimetypes
 from rich.logging import RichHandler
 from rich.progress import Progress
 import urllib.parse
@@ -135,22 +137,56 @@ class Parser():
         return path + (".html" if extension else "")

     def cache_file(self, url, filename = None):
-        if (not filename): filename = url
+        # stringify the url in case it's a Path object
+        url = str(url)
+
+        # if no filename specificed, generate an hashed id based the url,
+        # so we avoid re-downloading / caching files we already have
+        if (not filename):
+            filename = hashlib.sha1(str.encode(url)).hexdigest();
         destination = self.dist_folder / filename
-        if not Path(destination).is_file():
-            # Disabling proxy speeds up requests time
-            # https://stackoverflow.com/questions/45783655/first-https-request-takes-much-more-time-than-the-rest
-            # https://stackoverflow.com/questions/28521535/requests-how-to-disable-bypass-proxy
-            session = requests.Session()
-            session.trust_env = False
-            log.info(f"Downloading '{url}' to '{destination}'")
-            response = session.get(url)
-            Path(destination).parent.mkdir(parents=True, exist_ok=True)
-            with open(destination, "wb") as f:
-                f.write(response.content)
+
+        # check if there are any files matching the filename, ignoring extension
+        matching_file = glob.glob(str(destination.with_suffix('.*')))
+        if not matching_file:
+            # if url has a network scheme, download the file
+            if "http" in urllib.parse.urlparse(url).scheme:
+                # Disabling proxy speeds up requests time
+                # https://stackoverflow.com/questions/45783655/first-https-request-takes-much-more-time-than-the-rest
+                # https://stackoverflow.com/questions/28521535/requests-how-to-disable-bypass-proxy
+                session = requests.Session()
+                session.trust_env = False
+                log.info(f"Downloading '{url}'")
+                response = session.get(url)
+
+                # if the filename does not have an extension at this point,
+                # try to infer it from the url, and if not possible,
+                # from the content-type header mimetype
+                if (not destination.suffix):
+                    file_extension = Path(urllib.parse.urlparse(url).path).suffix
+                    if (not file_extension):
+                        content_type = response.headers['content-type']
+                        file_extension = mimetypes.guess_extension(content_type)
+                    destination = destination.with_suffix(file_extension)
+
+                Path(destination).parent.mkdir(parents=True, exist_ok=True)
+                with open(destination, "wb") as f:
+                    f.write(response.content)
+
+                return destination.relative_to(self.dist_folder)
+            # if not, check if it's a local file, and copy it to the dist folder
+            else:
+                if Path(url).is_file():
+                    log.debug(f"Caching local file '{url}'")
+                    destination = destination.with_suffix(Path(url).suffix)
+                    shutil.copyfile(url, destination)
+                    return destination.relative_to(self.dist_folder)
+        # if we already have a matching cached file, just return its relative path
         else:
-            log.debug(f"File '{destination}' was already downloaded")
-        return destination
+            cached_file = Path(matching_file[0]).relative_to(self.dist_folder)
+            log.debug(f"'{url}' was already downloaded")
+            return cached_file
+
+        # if all fails, return the original url
+        return url

     def init_chromedriver(self):
         log.info("Initialising chrome driver")
@@ -167,7 +203,7 @@ class Parser():
             service_log_path=str(Path.cwd() / "webdrive.log"),
             options=chrome_options)

-    def parse_page(self, url, processed_pages, index = None):
+    def parse_page(self, url, processed_pages = [], index = None):
         # if this is the first page being parse, set it as the index.html
         if (not index):
             index = url;
@@ -187,10 +223,12 @@ class Parser():
             log.critical("Timeout waiting for page content to load, or no content found. Are you sure the page is set to public?")
             return

+        # cooldown to allow eventual database items to load
+        # TODO: figure out a way to detect they loaded
         time.sleep(2)

-        # expands all the toggle block in the page to make their content visible
-        # we hook up our custom toggle logic afterwards
+        # function to expand all the toggle block in the page to make their content visible
+        # so we can hook up our custom toggle logic afterwards
         def open_toggle_blocks(exclude = []):
             opened_toggles = exclude;
             toggle_blocks = self.driver.find_elements_by_class_name("notion-toggle-block")
@@ -215,17 +253,23 @@ class Parser():
                 # if so, run the function again
                 open_toggle_blocks(opened_toggles)

+        # open those toggle blocks!
        open_toggle_blocks()

         # creates soup from the page to start parsing
         soup = BeautifulSoup(self.driver.page_source, "html.parser")

-        # process eventual embedded iframes
-        for embed in soup.select('div[embed-ghost]'):
-            iframe = embed.find('iframe');
-            iframe_parent = iframe.parent
-            iframe_parent['class'] = iframe_parent.get('class', []) + ['loconotion-iframe-target']
-            iframe_parent['loconotion-iframe-src'] = iframe['src']
+        # remove scripts and other tags we don't want / need
+        for unwanted in soup.findAll('script'):
+            unwanted.decompose();
+        for intercom_frame in soup.findAll('div',{'id':'intercom-frame'}):
+            intercom_frame.decompose();
+        for intercom_div in soup.findAll('div',{'class':'intercom-lightweight-app'}):
+            intercom_div.decompose();
+        for overlay_div in soup.findAll('div',{'class':'notion-overlay-container'}):
+            overlay_div.decompose();
+        for vendors_css in soup.find_all("link", href=lambda x: x and 'vendors~' in x):
+            vendors_css.decompose();

         # clean up the default notion meta tags
         for tag in ["description", "twitter:card", "twitter:site", "twitter:title", "twitter:description", "twitter:image", "twitter:url", "apple-itunes-app"]:
@@ -257,16 +301,8 @@ class Parser():
                 img_src = 'https://www.notion.so' + img['src'].split("notion.so")[-1].replace("notion.so", "").split("?")[0]
                 img_src = urllib.parse.unquote(img_src)

-                # generate an hashed id for the image filename based the url,
-                # so we avoid re-downloading images we have already downloaded,
-                # and figure out the filename from the url (I know, just this once)
-                img_extension = Path(urllib.parse.urlparse(img_src).path).suffix
-                #TODO: unsplash images don't have an extension in the url (they work though)
-                img_name = hashlib.sha1(str.encode(img_src)).hexdigest();
-                img_file = img_name + img_extension
-                self.cache_file(img_src, img_file)
-                img['src'] = img_file
+                cached_image = self.cache_file(img_src)
+                img['src'] = cached_image
             else:
                 if (img['src'].startswith('/')):
                     img['src'] = "https://www.notion.so" + img['src']
@@ -277,27 +313,18 @@ class Parser():
             # we don't need the vendors stylesheet
             if ("vendors~" in link['href']):
                 continue

-            css_file = link['href'].strip("/")
-            saved_css_file = self.cache_file('https://www.notion.so' + link['href'], css_file)
-            with open(saved_css_file, 'rb') as f:
+            # css_file = link['href'].strip("/")
+            cached_css_file = self.cache_file('https://www.notion.so' + link['href'])
+            with open(self.dist_folder / cached_css_file, 'rb') as f:
                 stylesheet = cssutils.parseString(f.read())
                 # open the stylesheet and check for any font-face rule,
                 for rule in stylesheet.cssRules:
                     if rule.type == cssutils.css.CSSRule.FONT_FACE_RULE:
                         # if any are found, download the font file
                         font_file = rule.style['src'].split("url(/")[-1].split(") format")[0]
-                        self.cache_file(f'https://www.notion.so/{font_file}', font_file)
-            link['href'] = css_file
-
-        # remove scripts and other tags we don't want / need
-        for unwanted in soup.findAll(['script', 'iframe']):
-            unwanted.decompose();
-        for intercom_div in soup.findAll('div',{'class':'intercom-lightweight-app'}):
-            intercom_div.decompose();
-        for overlay_div in soup.findAll('div',{'class':'notion-overlay-container'}):
-            overlay_div.decompose();
-        for vendors_css in soup.find_all("link", href=lambda x: x and 'vendors~' in x):
-            vendors_css.decompose();
+                        cached_font_file = self.cache_file(f'https://www.notion.so/{font_file}')
+                        rule.style['src'] = f"url({str(cached_font_file)})"
+            link['href'] = str(cached_css_file)

         # add our custom logic to all toggle blocks
         for toggle_block in soup.findAll('div',{'class':'notion-toggle-block'}):
@@ -360,19 +387,21 @@ class Parser():
                 # if the value refers to a file, copy it to the dist folder
                 if (attr.lower() == "href" or attr.lower() == "src"):
                     log.debug(f"Copying injected file '{value}'")
-                    source = (Path.cwd() / value.strip("/"))
-                    destination = (self.dist_folder / source.name)
-                    shutil.copyfile(source, destination)
-                    injected_tag[attr] = source.name
+                    cached_custom_file = self.cache_file((Path.cwd() / value.strip("/")))
+                    # destination = (self.dist_folder / source.name)
+                    # shutil.copyfile(source, destination)
+                    injected_tag[attr] = str(cached_custom_file) #source.name
                 log.debug(f"Injecting <{section}> tag: {str(injected_tag)}")
                 soup.find(section).append(injected_tag)

         injects_custom_tags("head")
         injects_custom_tags("body")

         # inject loconotion's custom stylesheet and script
-        custom_css = soup.new_tag("link", rel="stylesheet", href="loconotion.css")
+        loconotion_custom_css = self.cache_file("loconotion.css")
+        custom_css = soup.new_tag("link", rel="stylesheet", href=str(loconotion_custom_css))
         soup.head.insert(-1, custom_css)
-        custom_script = soup.new_tag("script", type="text/javascript", src="loconotion.js")
+        loconotion_custom_js = self.cache_file("loconotion.js")
+        custom_script = soup.new_tag("script", type="text/javascript", src=str(loconotion_custom_js))
         soup.body.insert(-1, custom_script)

         # find sub-pages and clean slugs / links
@@ -393,17 +422,23 @@ class Parser():
         processed_pages.append(url)

         # parse sub-pages
-        for sub_page in sub_pages:
-            if not sub_page in processed_pages:
-                self.parse_page(sub_page, processed_pages, index)
+        if (sub_pages):
+            if (processed_pages): log.debug(f"Pages processed so far: {processed_pages}")
+            for sub_page in sub_pages:
+                if not sub_page in processed_pages:
+                    self.parse_page(sub_page, processed_pages, index)
+
+        #we're all done!
+        return processed_pages

     def run(self, url):
-        processed_pages = []
-        self.parse_page(url, processed_pages)
-
-        # copy custom assets to dist folder
-        shutil.copyfile("loconotion.css", self.dist_folder / "loconotion.css");
-        shutil.copyfile("loconotion.js", self.dist_folder / "loconotion.js");
+        start_time = time.time()
+
+        total_processed_pages = self.parse_page(url)
+
+        elapsed_time = time.time() - start_time
+        formatted_time = '{:02d}:{:02d}:{:02d}'.format(int(elapsed_time // 3600), int(elapsed_time % 3600 // 60), int(elapsed_time % 60))
+        log.info(f'Finished!\nヽ( ・‿・)ノ Processed {len(total_processed_pages)} pages in {formatted_time}')

 parser = argparse.ArgumentParser(description='Generate static websites from Notion.so pages')
 parser.add_argument('target', help='The config file containing the site properties, or the url of the Notion.so page to generate the site from')
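As a quick worked example (not part of the commit), the HH:MM:SS line added to run() splits the elapsed seconds like this:

    elapsed_time = 3723.5  # sample value: 1 hour, 2 minutes, 3.5 seconds
    formatted_time = '{:02d}:{:02d}:{:02d}'.format(
        int(elapsed_time // 3600),       # hours   -> 1
        int(elapsed_time % 3600 // 60),  # minutes -> 2
        int(elapsed_time % 60))          # seconds -> 3
    print(formatted_time)                # prints 01:02:03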
@@ -425,7 +460,7 @@ if __name__ == '__main__':
     if Path(args.target).is_file():
         with open(args.target) as f:
             parsed_config = toml.loads(f.read())
-            log.info("Initialising parser with configuration file")
+            log.info(f"Initialising parser with configuration file: {parsed_config}")
             Parser(parsed_config)
     else:
         log.critical(f"Config file {args.target} does not exists")