From 71c44a87504530362ab62b96723abc1d7faadc14 Mon Sep 17 00:00:00 2001
From: Leonardo Cavaletti
Date: Mon, 18 May 2020 19:40:16 +0100
Subject: [PATCH] Improved file downloading / caching

---
 loconotion.css |   5 ++
 loconotion.js  |  55 +++++++++--------
 loconotion.py  | 161 ++++++++++++++++++++++++++++++-------------------
 3 files changed, 133 insertions(+), 88 deletions(-)

diff --git a/loconotion.css b/loconotion.css
index 0235963..8ab6a20 100644
--- a/loconotion.css
+++ b/loconotion.css
@@ -57,4 +57,9 @@ div[role="button"]:not(.notion-record-icon):hover {
   .notion-column_list-block > div > * {
     width: unset !important;
   }
+
+  /* Stops text from overflowing past max-width on bookmark blocks */
+  .notion-bookmark-block div {
+    white-space: unset !important;
+  }
 }
diff --git a/loconotion.js b/loconotion.js
index 018879f..c37581e 100644
--- a/loconotion.js
+++ b/loconotion.js
@@ -26,34 +26,39 @@ for (let i = 0; i < toggleButtons.length; i++) {
   }
 }
 
-const pendingIframes = document.getElementsByClassName("loconotion-iframe-target");
+const pendingIframes = document.getElementsByTagName("iframe");
 for (let i = 0; i < pendingIframes.length; i++) {
-  const pendingIframe = pendingIframes.item(i);
-  const iframeSrc = pendingIframe.getAttribute("loconotion-iframe-src");
-  const iframe = document.createElement("iframe");
-
-  pendingIframe.style.opacity = 0;
-  iframe.onload = () => {
-    pendingIframe.style.opacity = 1;
-  };
-
-  iframe.style.width = "100%";
-  iframe.style.height = "100%";
-  iframe.style.position = "absolute";
-  iframe.style.left = 0;
-  iframe.style.top = 0;
-  iframe.style.pointerEvents = "auto";
-
-  iframe.setAttribute("src", iframeSrc);
-  iframe.setAttribute("frameborder", "0");
-  iframe.setAttribute(
-    "sandbox",
-    "allow-scripts allow-popups allow-top-navigation-by-user-activation allow-forms allow-same-origin"
-  );
-
-  pendingIframe.appendChild(iframe);
+  pendingIframes.item(i).parentElement.style.opacity = 1;
 }
 
+// const pendingIframes = document.getElementsByClassName("loconotion-iframe-target");
+// for (let i = 0; i < pendingIframes.length; i++) {
+//   const pendingIframe = pendingIframes.item(i);
+//   const iframeSrc = pendingIframe.getAttribute("loconotion-iframe-src");
+//   const iframe = document.createElement("iframe");
+
+//   pendingIframe.style.opacity = 0;
+//   iframe.onload = () => {
+//     pendingIframe.style.opacity = 1;
+//   };
+
+//   iframe.style.width = "100%";
+//   iframe.style.height = "100%";
+//   iframe.style.position = "absolute";
+//   iframe.style.left = 0;
+//   iframe.style.top = 0;
+//   iframe.style.pointerEvents = "auto";
+
+//   iframe.setAttribute("src", iframeSrc);
+//   iframe.setAttribute("frameborder", "0");
+//   iframe.setAttribute(
+//     "sandbox",
+//     "allow-scripts allow-popups allow-top-navigation-by-user-activation allow-forms allow-same-origin"
+//   );
+
+//   pendingIframe.appendChild(iframe);
+// }
+
 const collectionSearchBoxes = document.getElementsByClassName("collectionSearch");
 for (let i = 0; i < collectionSearchBoxes.length; i++) {
   const collectionSearchBox = collectionSearchBoxes.item(i).parentElement;
diff --git a/loconotion.py b/loconotion.py
index 45868f2..319eece 100644
--- a/loconotion.py
+++ b/loconotion.py
@@ -6,6 +6,8 @@ import time
 import uuid
 import logging
 import re
+import glob
+import mimetypes
 from rich.logging import RichHandler
 from rich.progress import Progress
 import urllib.parse
@@ -135,22 +137,56 @@ class Parser():
         return path + (".html" if extension else "")
 
     def cache_file(self, url, filename = None):
-        if (not filename): filename = url
+        # stringify the url in case it's a Path object
+        url = str(url)
+
+        # if no filename is specified, generate a hashed id based on the url,
+        # so we avoid re-downloading / caching files we already have
+        if (not filename):
+            filename = hashlib.sha1(str.encode(url)).hexdigest()
         destination = self.dist_folder / filename
-        if not Path(destination).is_file():
-            # Disabling proxy speeds up requests time
-            # https://stackoverflow.com/questions/45783655/first-https-request-takes-much-more-time-than-the-rest
-            # https://stackoverflow.com/questions/28521535/requests-how-to-disable-bypass-proxy
-            session = requests.Session()
-            session.trust_env = False
-            log.info(f"Downloading '{url}' to '{destination}'")
-            response = session.get(url)
-            Path(destination).parent.mkdir(parents=True, exist_ok=True)
-            with open(destination, "wb") as f:
-                f.write(response.content)
+
+        # check if there are any cached files matching the filename, ignoring extension
+        matching_file = glob.glob(str(destination.with_suffix('.*')))
+        if not matching_file:
+            # if the url has a network scheme, download the file
+            if "http" in urllib.parse.urlparse(url).scheme:
+                # Disabling proxy speeds up requests time
+                # https://stackoverflow.com/questions/45783655/first-https-request-takes-much-more-time-than-the-rest
+                # https://stackoverflow.com/questions/28521535/requests-how-to-disable-bypass-proxy
+                session = requests.Session()
+                session.trust_env = False
+                log.info(f"Downloading '{url}'")
+                response = session.get(url)
+
+                # if the filename does not have an extension at this point,
+                # try to infer it from the url, and if that's not possible,
+                # from the content-type header's mimetype
+                if (not destination.suffix):
+                    file_extension = Path(urllib.parse.urlparse(url).path).suffix
+                    if (not file_extension):
+                        content_type = response.headers['content-type']
+                        # guess_extension() returns None for unknown mimetypes
+                        file_extension = mimetypes.guess_extension(content_type) or ""
+                    destination = destination.with_suffix(file_extension)
+
+                Path(destination).parent.mkdir(parents=True, exist_ok=True)
+                with open(destination, "wb") as f:
+                    f.write(response.content)
+                return destination.relative_to(self.dist_folder)
+            # if not, check if it's a local file, and copy it to the dist folder
+            else:
+                if Path(url).is_file():
+                    log.debug(f"Caching local file '{url}'")
+                    destination = destination.with_suffix(Path(url).suffix)
+                    shutil.copyfile(url, destination)
+                    return destination.relative_to(self.dist_folder)
+        # if we already have a matching cached file, just return its relative path
         else:
-            log.debug(f"File '{destination}' was already downloaded")
-        return destination
+            cached_file = Path(matching_file[0]).relative_to(self.dist_folder)
+            log.debug(f"'{url}' was already downloaded")
+            return cached_file
+        # if all else fails, return the original url
+        return url
 
     def init_chromedriver(self):
         log.info("Initialising chrome driver")
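
Note: the caching scheme above boils down to hashing the url for a stable cache key,
then recovering a file extension either from the url path or from the response's
content-type. A minimal standalone sketch of that naming logic (cached_name is a
hypothetical helper for illustration, not part of this patch):

    import hashlib
    import mimetypes
    import urllib.parse
    from pathlib import Path

    def cached_name(url, content_type=None):
        # sha1 of the url gives a stable cache key, so repeated calls
        # for the same url map to the same cached file
        name = hashlib.sha1(url.encode()).hexdigest()
        # prefer the extension in the url path ('/image.png' -> '.png')
        ext = Path(urllib.parse.urlparse(url).path).suffix
        if not ext and content_type:
            # fall back to the mimetype; guess_extension() returns None for unknown types
            ext = mimetypes.guess_extension(content_type) or ""
        return name + ext

    print(cached_name("https://example.com/cover", "image/png"))  # '<sha1 hex>.png'
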
@@ -167,7 +203,7 @@
             service_log_path=str(Path.cwd() / "webdrive.log"),
             options=chrome_options)
 
-    def parse_page(self, url, processed_pages, index = None):
+    def parse_page(self, url, processed_pages = None, index = None):
+        # avoid a mutable default argument ([]), which would be shared across calls
+        if (processed_pages is None): processed_pages = []
         # if this is the first page being parse, set it as the index.html
         if (not index): index = url;
@@ -187,10 +223,12 @@
             log.critical("Timeout waiting for page content to load, or no content found. Are you sure the page is set to public?")
             return
 
+        # cooldown to allow any outstanding database items to load
+        # TODO: figure out a way to detect when they have loaded
         time.sleep(2)
 
-        # expands all the toggle block in the page to make their content visible
-        # we hook up our custom toggle logic afterwards
+        # function to expand all the toggle blocks in the page to make their content visible,
+        # so we can hook up our custom toggle logic afterwards
         def open_toggle_blocks(exclude = []):
             opened_toggles = exclude;
             toggle_blocks = self.driver.find_elements_by_class_name("notion-toggle-block")
@@ -215,17 +253,23 @@
                 # if so, run the function again
                 open_toggle_blocks(opened_toggles)
 
+        # open those toggle blocks!
         open_toggle_blocks()
 
         # creates soup from the page to start parsing
         soup = BeautifulSoup(self.driver.page_source, "html.parser")
 
-        # process eventual embedded iframes
-        for embed in soup.select('div[embed-ghost]'):
-            iframe = embed.find('iframe');
-            iframe_parent = iframe.parent
-            iframe_parent['class'] = iframe_parent.get('class', []) + ['loconotion-iframe-target']
-            iframe_parent['loconotion-iframe-src'] = iframe['src']
+        # remove scripts and other tags we don't want / need
+        for unwanted in soup.findAll('script'):
+            unwanted.decompose()
+        for intercom_frame in soup.findAll('div',{'id':'intercom-frame'}):
+            intercom_frame.decompose()
+        for intercom_div in soup.findAll('div',{'class':'intercom-lightweight-app'}):
+            intercom_div.decompose()
+        for overlay_div in soup.findAll('div',{'class':'notion-overlay-container'}):
+            overlay_div.decompose()
+        for vendors_css in soup.find_all("link", href=lambda x: x and 'vendors~' in x):
+            vendors_css.decompose()
 
         # clean up the default notion meta tags
         for tag in ["description", "twitter:card", "twitter:site", "twitter:title", "twitter:description", "twitter:image", "twitter:url", "apple-itunes-app"]:
@@ -257,16 +301,8 @@
                 img_src = 'https://www.notion.so' + img['src'].split("notion.so")[-1].replace("notion.so", "").split("?")[0]
                 img_src = urllib.parse.unquote(img_src)
 
-                # generate an hashed id for the image filename based the url,
-                # so we avoid re-downloading images we have already downloaded,
-                # and figure out the filename from the url (I know, just this once)
-                img_extension = Path(urllib.parse.urlparse(img_src).path).suffix
-                #TODO: unsplash images don't have an extension in the url (they work though)
-                img_name = hashlib.sha1(str.encode(img_src)).hexdigest();
-                img_file = img_name + img_extension
-
-                self.cache_file(img_src, img_file)
-                img['src'] = img_file
+                cached_image = self.cache_file(img_src)
+                img['src'] = cached_image
             else:
                 if (img['src'].startswith('/')): img['src'] = "https://www.notion.so" + img['src']
@@ -277,27 +313,18 @@
             # we don't need the vendors stylesheet
             if ("vendors~" in link['href']): continue
 
-            css_file = link['href'].strip("/")
-            saved_css_file = self.cache_file('https://www.notion.so' + link['href'], css_file)
-            with open(saved_css_file, 'rb') as f:
+            cached_css_file = self.cache_file('https://www.notion.so' + link['href'])
+            with open(self.dist_folder / cached_css_file, 'rb') as f:
                 stylesheet = cssutils.parseString(f.read())
                 # open the stylesheet and check for any font-face rule,
                 for rule in stylesheet.cssRules:
                     if rule.type == cssutils.css.CSSRule.FONT_FACE_RULE:
                         # if any are found, download the font file
                         font_file = rule.style['src'].split("url(/")[-1].split(") format")[0]
-                        self.cache_file(f'https://www.notion.so/{font_file}', font_file)
-            link['href'] = css_file
-
-        # remove scripts and other tags we don't want / need
-        for unwanted in soup.findAll(['script', 'iframe']):
-            unwanted.decompose();
-        for intercom_div in soup.findAll('div',{'class':'intercom-lightweight-app'}):
-            intercom_div.decompose();
-        for overlay_div in soup.findAll('div',{'class':'notion-overlay-container'}):
-            overlay_div.decompose();
-        for vendors_css in soup.find_all("link", href=lambda x: x and 'vendors~' in x):
-            vendors_css.decompose();
+                        cached_font_file = self.cache_file(f'https://www.notion.so/{font_file}')
+                        rule.style['src'] = f"url({str(cached_font_file)})"
+            link['href'] = str(cached_css_file)
 
         # add our custom logic to all toggle blocks
         for toggle_block in soup.findAll('div',{'class':'notion-toggle-block'}):
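
Note: the font-face handling above edits the parsed stylesheet in place through
cssutils. A minimal sketch of that pattern against an inline stylesheet (the cached
filename 'd2f6a0.woff2' is made up for illustration):

    import cssutils

    sheet = cssutils.parseString(
        "@font-face { font-family: F; src: url(/fonts/f.woff2) format('woff2'); }")
    for rule in sheet.cssRules:
        if rule.type == cssutils.css.CSSRule.FONT_FACE_RULE:
            # point the src declaration at the locally cached copy
            rule.style['src'] = "url(d2f6a0.woff2)"
    print(sheet.cssText.decode())  # the re-serialized sheet with the rewritten src

One thing worth double-checking in this hunk: the rewritten src lives on the parsed
stylesheet object, so unless the sheet is re-serialized into the cached css file
somewhere downstream, the url swap won't reach the file served from the dist folder.
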
@@ -360,19 +387,21 @@
                 # if the value refers to a file, copy it to the dist folder
                 if (attr.lower() == "href" or attr.lower() == "src"):
                     log.debug(f"Copying injected file '{value}'")
-                    source = (Path.cwd() / value.strip("/"))
-                    destination = (self.dist_folder / source.name)
-                    shutil.copyfile(source, destination)
-                    injected_tag[attr] = source.name
+                    cached_custom_file = self.cache_file((Path.cwd() / value.strip("/")))
+                    injected_tag[attr] = str(cached_custom_file)
             log.debug(f"Injecting <{section}> tag: {str(injected_tag)}")
             soup.find(section).append(injected_tag)
 
         injects_custom_tags("head")
         injects_custom_tags("body")
 
         # inject loconotion's custom stylesheet and script
-        custom_css = soup.new_tag("link", rel="stylesheet", href="loconotion.css")
+        loconotion_custom_css = self.cache_file("loconotion.css")
+        custom_css = soup.new_tag("link", rel="stylesheet", href=str(loconotion_custom_css))
         soup.head.insert(-1, custom_css)
-        custom_script = soup.new_tag("script", type="text/javascript", src="loconotion.js")
+        loconotion_custom_js = self.cache_file("loconotion.js")
+        custom_script = soup.new_tag("script", type="text/javascript", src=str(loconotion_custom_js))
         soup.body.insert(-1, custom_script)
 
         # find sub-pages and clean slugs / links
@@ -393,17 +422,23 @@
         processed_pages.append(url)
 
         # parse sub-pages
-        for sub_page in sub_pages:
-            if not sub_page in processed_pages:
-                self.parse_page(sub_page, processed_pages, index)
+        if (sub_pages):
+            if (processed_pages): log.debug(f"Pages processed so far: {processed_pages}")
+            for sub_page in sub_pages:
+                if not sub_page in processed_pages:
+                    self.parse_page(sub_page, processed_pages, index)
+
+        # we're all done!
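
Note: the recursion above threads a single processed_pages list through every
sub-page call, which is why the signature earlier in the patch defaults it to None
rather than []: Python evaluates default values once, at function definition time,
so a [] default is one shared list reused by every call that omits the argument.
A self-contained illustration of the pitfall (visit is a made-up function, not part
of the patch):

    def visit(page, seen=[]):  # buggy: a single list object shared by all calls
        seen.append(page)
        return seen

    visit("index")
    print(visit("about"))  # ['index', 'about'] - 'index' leaked from the first call

    def visit_fixed(page, seen=None):
        if seen is None:
            seen = []  # a fresh list on every call
        seen.append(page)
        return seen

    print(visit_fixed("about"))  # ['about']
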
+        return processed_pages
 
     def run(self, url):
-        processed_pages = []
-        self.parse_page(url, processed_pages)
+        start_time = time.time()
 
-        # copy custom assets to dist folder
-        shutil.copyfile("loconotion.css", self.dist_folder / "loconotion.css");
-        shutil.copyfile("loconotion.js", self.dist_folder / "loconotion.js");
+        total_processed_pages = self.parse_page(url)
+
+        elapsed_time = time.time() - start_time
+        formatted_time = '{:02d}:{:02d}:{:02d}'.format(int(elapsed_time // 3600), int(elapsed_time % 3600 // 60), int(elapsed_time % 60))
+        log.info(f'Finished!\nヽ( ・‿・)ノ Processed {len(total_processed_pages)} pages in {formatted_time}')
 
 parser = argparse.ArgumentParser(description='Generate static websites from Notion.so pages')
 parser.add_argument('target', help='The config file containing the site properties, or the url of the Notion.so page to generate the site from')
@@ -425,7 +460,7 @@ if __name__ == '__main__':
     if Path(args.target).is_file():
         with open(args.target) as f:
             parsed_config = toml.loads(f.read())
-            log.info("Initialising parser with configuration file")
+            log.info(f"Initialising parser with configuration file: {parsed_config}")
             Parser(parsed_config)
     else:
         log.critical(f"Config file {args.target} does not exists")
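
Note: the elapsed-time formatting in run() above works out as follows (3725.6 is
just an example value):

    elapsed_time = 3725.6  # 1 hour, 2 minutes, 5.6 seconds
    formatted_time = '{:02d}:{:02d}:{:02d}'.format(
        int(elapsed_time // 3600), int(elapsed_time % 3600 // 60), int(elapsed_time % 60))
    print(formatted_time)  # -> 01:02:05

An equivalent one-liner is str(datetime.timedelta(seconds=int(elapsed_time))),
which yields '1:02:05' without the zero-padded hour.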