Added support for lazy loaded page elements, and --timeout argument

This commit is contained in:
Leonardo Cavaletti
2020-08-15 16:09:27 +01:00
parent 29bf4969e2
commit c8f14a6678
3 changed files with 36 additions and 6 deletions

View File

@ -84,7 +84,7 @@ Here's what a full .toml configuration would look like, alongside with explanati
name = "Notion Test Site"
# the notion.so page to begin parsing from. This page will become the index.html
# of the generated site, and loconotation will parse all sub-pages present on the page
# of the generated site, and loconotion will parse all sub-pages present on the page
page = "https://www.notion.so/Loconotion-Example-Page-03c403f4fdc94cc1b315b9469a8950ef"
## Global Site Settings ##
@ -185,6 +185,9 @@ On top of this, the script can take these optional arguments:
Use a specific chromedriver executable instead of the
auto-installing one
--single-page Only parse the first page, then stop
--timeout TIMEOUT Time in seconds to wait for the loading of lazy-loaded
dynamic elements (default 5). If content from the page
seems to be missing, try increasing this value
--clean Delete all previously cached files for the site before
generating it
--clean-css Delete previously cached .css files for the site
@ -210,6 +213,6 @@ On top of this, the script can take these optional arguments:
If you used Loconotion to build a cool site and want it added to the list above, shoot me a mail or submit a pull request!
## Support ![https://www.buymeacoffee.com/leoncvlt](https://img.shields.io/badge/-buy%20me%20a%20coffee-lightgrey?style=flat&logo=buy-me-a-coffee&color=FF813F&logoColor=white)
## Support [![Buy me a coffee](https://img.shields.io/badge/-buy%20me%20a%20coffee-lightgrey?style=flat&logo=buy-me-a-coffee&color=FF813F&logoColor=white "Buy me a coffee")](https://www.buymeacoffee.com/leoncvlt)
If you found this useful, consider [buying me a coffee](https://www.buymeacoffee.com/leoncvlt) so I get a nice dose of methylxanthine, and you get a nice dose of karma.

View File

@ -34,6 +34,12 @@ def main():
argparser.add_argument(
"--single-page", action="store_true", help="Only parse the first page, then stop"
)
# The timeout is consumed as a number of seconds (the parser hands it to
# time.sleep() and WebDriverWait()). Without an explicit type, argparse would
# deliver a value given on the command line as a raw string, and only the
# integer default would work, so convert it while parsing.
argparser.add_argument(
    "--timeout",
    type=int,
    default=5,
    help="Time in seconds to wait for the loading of lazy-loaded dynamic elements (default 5)."
    " If content from the page seems to be missing, try increasing this value",
)
argparser.add_argument(
"--clean",
action="store_true",

View File

@ -257,9 +257,30 @@ class Parser:
)
return
# scroll at the bottom of the notion-scroller element to load all elements
# continue once there are no changes in height after a timeout
# don't do this if the page has a calendar database on it or it will load forever
calendar = self.driver.find_elements_by_class_name("notion-calendar-view")
if not calendar:
scroller = self.driver.find_element_by_css_selector(
".notion-frame > .notion-scroller"
)
last_height = scroller.get_attribute("scrollHeight")
log.debug(f"Scrolling to bottom of notion-scroller (height: {last_height})")
while True:
self.driver.execute_script(
"arguments[0].scrollTo(0, arguments[0].scrollHeight)", scroller
)
time.sleep(self.args["timeout"])
new_height = scroller.get_attribute("scrollHeight")
log.debug(f"New notion-scroller height after timeout is: {new_height}")
if new_height == last_height:
break
last_height = new_height
# function to expand all the toggle blocks in the page to make their content visible
# so we can hook up our custom toggle logic afterwards
def open_toggle_blocks(exclude=[]):
def open_toggle_blocks(timeout, exclude=[]):
opened_toggles = exclude
toggle_blocks = self.driver.find_elements_by_class_name("notion-toggle-block")
log.debug(f"Opening {len(toggle_blocks)} new toggle blocks in the page")
@ -278,7 +299,7 @@ class Parser:
# click on it, then wait until all elements are displayed
toggle_button.click()
try:
WebDriverWait(self.driver, 10).until(
WebDriverWait(self.driver, timeout).until(
toggle_block_has_opened(toggle_block)
)
except TimeoutException as ex:
@ -297,10 +318,10 @@ class Parser:
)
if len(new_toggle_blocks) > len(toggle_blocks):
# if so, run the function again
open_toggle_blocks(opened_toggles)
open_toggle_blocks(timeout, opened_toggles)
# open the toggle blocks in the page
open_toggle_blocks()
open_toggle_blocks(self.args["timeout"])
# creates soup from the page to start parsing
soup = BeautifulSoup(self.driver.page_source, "html.parser")