From a70c8b47083c9ba8bc5febed4d03535e4a4aced8 Mon Sep 17 00:00:00 2001 From: Leonardo Cavaletti Date: Sun, 17 May 2020 10:40:18 +0100 Subject: [PATCH] Added config file, custom fonts injection --- README.md | 23 ++++++- loconotion.py | 176 ++++++++++++++++++++++++++++++----------------- requirements.txt | 16 +++++ test_site.toml | 20 ++++++ 4 files changed, 170 insertions(+), 65 deletions(-) create mode 100644 requirements.txt create mode 100644 test_site.toml diff --git a/README.md b/README.md index 509d994..39138f3 100644 --- a/README.md +++ b/README.md @@ -1 +1,22 @@ -loconotion +# loconotion + +Notion.so is a popular application where you can create your own workspace. It's very easy to use. Notion also offers the possibility of making a page (and its sub-pages) public on the web and because of this several people choose to use Notion to manage their personal blog, portfolio, or some kind of simple website. Notion however does not support custom domains when doing so: your pages are stuck on the `notion.so` domain, with computer-generated urls and slugs. + +Some services like Super, HostingPotion, HostNotion and Fruition cleverly tried to work around this issue by relying on a clever hack using CloudFlare workers. This solution, however, has some disadvantages: + +- Not free (Super, HostingPotion and HostNotion all take a monthly fee: Fruition is open-sourced but any domain with a decent amount of daily visits will soon clash against CloudFlare's free tier limitations, and force you to upgrade to the $5 or more plan.) +- As the page is still hosted on Notion, it comes bundled with all their analytics, editing / collaboration javascript, vendors css, and more bloat which causes the page to load at speeds that are not exactly appropriate to a simple blog / website. 
Running [this](https://www.notion.so/The-perfect-It-s-Always-Sunny-in-Philadelphia-episode-d08aaec2b24946408e8be0e9f2ae857e) example page on Google's [PageSpeed Insights](https://developers.google.com/speed/pagespeed/insights/) scores a measly 24 / 66 on mobile / desktop. + +Enter Loconotion! + +Loconotion is a tool that approaches this a bit differently. It lets Notion render the page, then parses it and saves a static version of the page to disk. While doing so, it strips out all the unnecessary bloat, and adds some extra css and js to keep the nice features like mobile responsiveness working. It also saves all related images / assets, and parses any subpage as well while keeping links intact, and cleaning up the urls. The result? A faster, self-contained version of the page that keeps all of Notion's nice layouts and eye candies, ready to be deployed on your CDN of choice. For a comparison, the same example page parsed with Loconotion and deployed on Netlify's free tier achieves a PageSpeed Insights score of 96 / 100! + +This approach also offers the advantage of being able to inject anything in the pages, from custom fonts to additional meta tags for SEO, or custom analytics. + +However, bear in mind that as we are effectively parsing a static version of the page, the following features will not work: +- All pages will open in their own page and not modals (this could be a pro, depending on how you look at it) +- Databases will be presented in their initial view - no switching views from table to gallery, for example. +- All editing features will be disabled - no ticking checkboxes or dragging kanban board cards around. Usually not an issue as public pages usually have changes locked. +- Dynamic elements won't update automatically - for example, the calendar will not highlight the current date. + +Everything else, like embedding and dropdowns, should work as expected. 
\ No newline at end of file diff --git a/loconotion.py b/loconotion.py index d7cb377..325023f 100644 --- a/loconotion.py +++ b/loconotion.py @@ -10,6 +10,8 @@ from rich.logging import RichHandler from rich.progress import Progress import urllib.parse import hashlib +import toml +import argparse from selenium import webdriver from selenium.webdriver.chrome.options import Options @@ -51,8 +53,7 @@ def download_file(url, destination): # https://stackoverflow.com/questions/28521535/requests-how-to-disable-bypass-proxy session = requests.Session() session.trust_env = False - parsed_url = urllib.parse.urlparse(url) - log.info(f"Downloading {parsed_url.scheme + parsed_url.netloc + parsed_url.path} to {destination}") + log.info(f"Downloading {url} to {destination}") response = session.get(url) Path(destination).parent.mkdir(parents=True, exist_ok=True) with open(destination, "wb") as f: @@ -84,19 +85,18 @@ def download_file(url, destination): # log.debug(f"File {destination} was already downloaded") # return destination -class notion_page_loaded(object): - """An expectation for checking that a notion page has loaded. - """ - def __call__(self, driver): - notion_presence = len(driver.find_elements_by_class_name("notion-presence-container")) - loading_spinners = len(driver.find_elements_by_class_name("loading-spinner")); - # embed_ghosts = len(driver.find_elements_by_css_selector("div[embed-ghost]")); - log.debug(f"Waiting for page content to load (presence container: {notion_presence}, loaders: {loading_spinners} )") - if (notion_presence and not loading_spinners): - return True - else: - return False - +# class notion_page_loaded(object): +# """An expectation for checking that a notion page has loaded. 
+# """ +# def __call__(self, driver): +# notion_presence = len(driver.find_elements_by_class_name("notion-presence-container")) +# loading_spinners = len(driver.find_elements_by_class_name("loading-spinner")); +# # embed_ghosts = len(driver.find_elements_by_css_selector("div[embed-ghost]")); +# log.debug(f"Waiting for page content to load (presence container: {notion_presence}, loaders: {loading_spinners} )") +# if (notion_presence and not loading_spinners): +# return True +# else: +# return False class toggle_block_has_opened(object): """An expectation for checking that a notion toggle block has been opened. @@ -120,14 +120,25 @@ class toggle_block_has_opened(object): return False class Parser(): - def __init__(self, dist_folder): - self.dist_folder = Path(dist_folder) - self.driver = self.init_chromedriver() + def __init__(self, config = {}): + url = config.get("page", None) + if not url: + log.error("No url specified") + return - # create output path if it doesn't exists + self.driver = self.init_chromedriver() + self.config = config + + # get the site name from the config, or make it up by cleaning the target page's slug + site_name = self.config.get("name", get_clean_slug(url, extension = False)) + + # set the output folder based on the site name, and create it if necessary + self.dist_folder = Path(config.get("output", Path("dist") / site_name)) self.dist_folder.mkdir(parents=True, exist_ok=True) log.info(f"Setting output path to {self.dist_folder}") + self.run(url) + def init_chromedriver(self): log.info("Initialising chrome driver") chrome_options = Options() @@ -150,11 +161,16 @@ class Parser(): log.info(f'Parsing page {url}') self.driver.get(url) + + # if ("This content does not exist" in self.driver.page_source): + # log.error(f"No content found in {url}. 
Are you sure the page is set to public?") + # return + try: # WebDriverWait(self.driver, 10).until(notion_page_loaded()) WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'notion-presence-container'))) except TimeoutException as ex: - log.error("Timeout waiting for page content to load") + log.error("Timeout waiting for page content to load, or no content found. Are you sure the page is set to public?") return time.sleep(2) @@ -188,7 +204,7 @@ class Parser(): open_toggle_blocks() # creates soup from the page to start parsing - soup = BeautifulSoup(self.driver.page_source, "lxml") + soup = BeautifulSoup(self.driver.page_source) # process eventual embedded iframes for embed in soup.select('div[embed-ghost]'): @@ -199,40 +215,39 @@ class Parser(): # process meta tags def set_meta_tag(prop_name, prop_value, content): + log.debug(f"Setting meta tag {prop_value} to '{content}'") tag = soup.find("meta", attrs = { prop_name : prop_value}) if (tag): - log.debug(f"Setting meta tag {prop_value} to {content}") if (content): tag["content"] = content else: tag.decompose(); else: - log.warn(f"Meta tag with {prop_name}: {prop_value} was not found") + tag = soup.new_tag('meta') + tag.attrs[prop_name] = prop_value + tag.attrs['content'] = content + soup.head.append(tag) - set_meta_tag("name", "description", None) - set_meta_tag("name", "twitter:card", None) - set_meta_tag("name", "twitter:site", None) - set_meta_tag("name", "twitter:title", None) - set_meta_tag("name", "twitter:description", None) - set_meta_tag("name", "twitter:image", None) - set_meta_tag("name", "twitter:url", None) - set_meta_tag("property", "og:site_name", None) - set_meta_tag("property", "og:type", None) - set_meta_tag("property", "og:url", None) - set_meta_tag("property", "og:title", None) - set_meta_tag("property", "og:description", None) - set_meta_tag("property", "og:image", None) - set_meta_tag("name", "apple-itunes-app", None) + # clean up the default notion meta tags + 
for tag in ["description", "twitter:card", "twitter:site", "twitter:title", "twitter:description", "twitter:image", "twitter:url", "apple-itunes-app"]: + set_meta_tag("name", tag, None) + for tag in ["og:site_name", "og:type", "og:url", "og:title", "og:description", "og:image"]: + set_meta_tag("property", tag, None) + + # set custom meta tags + for name, content in self.config.get("meta", {}).items(): + set_meta_tag("name", name, content) # process images cache_images = True for img in soup.findAll('img'): if img.has_attr('src'): - if (cache_images): + if (cache_images and not 'data:image' in img['src']): img_src = img['src'] # if the path starts with /, it's one of notion's predefined images if (img['src'].startswith('/')): # notion's images urls are in a weird format, need to sanitize them img_src = 'https://www.notion.so' + img['src'].split("notion.so")[-1].replace("notion.so", "").split("?")[0] + img_src = urllib.parse.unquote(img_src) #TODO # generate an hashed id for the image filename based the url, # so we avoid re-downloading images we have already downloaded, @@ -286,27 +301,45 @@ class Parser(): toggle_content.attrs['loconotion-toggle-id'] = toggle_button.attrs['loconotion-toggle-id'] = toggle_id # embed custom google font - custom_font = None - if (custom_font): - custom_font_stylesheet_stylesheet = soup.new_tag("link") - custom_font_stylesheet.attrs["rel"] = "stylesheet" - custom_font_stylesheet.attrs["href"] = f"https://fonts.googleapis.com/css2?family={custom_font}:wght@500;600;700&display=swap" - soup.head.insert(-1, custom_font_stylesheet) - for app in soup.findAll('div',{'class':'notion-app-inner'}): - style = cssutils.parseStyle(app['style']); - style['font-family'] = f"'{custom_font}', {style['font-family']}" - app['style'] = style.cssText + fonts_selectors = { + "site" : "div:not(.notion-code-block)", + "navbar": ".notion-topbar div", + "title" : ".notion-page-block, .notion-collection_view_page-block", + "h1" : ".notion-header-block div", + 
"h2" : ".notion-sub_header-block div", + "h3" : ".notion-sub_sub_header-block div", + "body" : ".notion-app-inner", + "code" : ".notion-code-block *" + } - # append custom stylesheet - custom_css = soup.new_tag("link") - custom_css.attrs["rel"] = "stylesheet" - custom_css.attrs["href"] = "loconotion.css" + custom_fonts = self.config.get("fonts", {}) + if (custom_fonts): + # append a stylesheet importing the google font for each unique font + unique_custom_fonts = set(custom_fonts.values()) + for font in unique_custom_fonts: + custom_font_stylesheet = soup.new_tag("link", rel="stylesheet", + href=f"https://fonts.googleapis.com/css2?family={font}:wght@500;600;700&display=swap") + soup.head.append(custom_font_stylesheet); + + # go through each custom font, and add a css rule overriding the font-family + # to the font override stylesheet targetting the appropriate selector + font_override_stylesheet = soup.new_tag('style', type='text/css') + for target, custom_font in custom_fonts.items(): + if custom_font and not target == "site": + log.debug(f"Setting {target} font-family to {custom_font}") + font_override_stylesheet.append(fonts_selectors[target] + " {font-family:" + custom_font + " !important}") + site_font = custom_fonts.get("site", None) + # process global site font last to more granular settings can override it + if (site_font): + log.debug(f"Setting global site font-family to {site_font}") + font_override_stylesheet.append(fonts_selectors["site"] + " {font-family:" + site_font + "}") + + soup.head.append(font_override_stylesheet) + + # append custom stylesheet and script + custom_css = soup.new_tag("link", rel="stylesheet", href="loconotion.css") soup.head.insert(-1, custom_css) - - # append custom script - custom_script = soup.new_tag("script") - custom_script.attrs["type"] = "text/javascript" - custom_script.attrs["src"] = "loconotion.js" + custom_script = soup.new_tag("script", type="text/javascript", src="loconotion.js") soup.body.insert(-1, custom_script) 
# find sub-pages and clean slugs / links @@ -339,15 +372,30 @@ class Parser(): shutil.copyfile("loconotion.css", self.dist_folder / "loconotion.css"); shutil.copyfile("loconotion.js", self.dist_folder / "loconotion.js"); +parser = argparse.ArgumentParser(description='Generate static websites from Notion.so pages') +parser.add_argument('target', help='The config file containing the site properties, or the url of the Notion.so page to generate the site from') +args = parser.parse_args() + if __name__ == '__main__': try: - url = "https://www.notion.so/leoncvlt-f276385bf5ce42969497f0b03aef907e" - output_folder = Path("dist") / get_clean_slug(url, extension = False) - parser = Parser(output_folder) - parser.run(url) - # parser.run("https://www.notion.so/A-Notion-Page-03c403f4fdc94cc1b315b9469a8950ef") - # parser.run("https://www.notion.so/Media-be1a5c3e1c9640a0ab9ba0ba9b67e6a5") - # parser.run('https://www.notion.so/leoncvlt-f276385bf5ce42969497f0b03aef907e') + if urllib.parse.urlparse(args.target).scheme: + try: + response = requests.get(args.target) + if ("notion.so" in args.target): + log.info("Initialising parser with simple page url") + Parser({ "page" : args.target }) + else: + log.error(f"{args.target} is not a notion.so page") + except requests.ConnectionError as exception: + log.error(f"{args.target} does not seem to be an existing web page") + else: + if Path(args.target).is_file(): + with open(args.target) as f: + parsed_config = toml.loads(f.read()) + log.info("Initialising parser with configuration file") + Parser(parsed_config) + else: + log.error(f"Config file {args.target} does not exists") except KeyboardInterrupt: log.error('Interrupted by user') try: diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..247bbc0 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,16 @@ +beautifulsoup4==4.9.0 +certifi==2020.4.5.1 +chardet==3.0.4 +colorama==0.4.3 +commonmark==0.9.1 +cssutils==1.0.2 +idna==2.9 +pprintpp==0.4.0 +Pygments==2.6.1 
+requests==2.23.0 +rich==1.1.5 +selenium==3.141.0 +soupsieve==2.0 +toml==0.10.1 +typing-extensions==3.7.4.2 +urllib3==1.25.9 diff --git a/test_site.toml b/test_site.toml new file mode 100644 index 0000000..2105f2e --- /dev/null +++ b/test_site.toml @@ -0,0 +1,20 @@ +name = "Notion Test Site" +page = "https://www.notion.so/A-Notion-Page-03c403f4fdc94cc1b315b9469a8950ef" + +[meta] +title = "Loconotion Test Site" +description = "A static site generated from a Notion.so page using Loconotion" + +[fonts] +site = 'Roboto' +navbar = '' +title = '' +h1 = 'Nunito' +h2 = 'Nunito' +h3 = 'Nunito' +body = '' +code = '' + +[custom] +js = [] +css = [] \ No newline at end of file