More reliable content loading

Increases the height of the headless browser window to 20000 pixels so
that no scrolling is needed for all content to load.

This fixes a problem where some content in the middle of the screen would
not be loaded.

Also adds an additional signal to the page load check that looks for any
changes in the page source. This fixes a situation where
`notion-scroller` already has some children but not all content
has been loaded yet.
This commit is contained in:
Martynas Mickevičius 2022-03-29 20:12:46 +03:00
parent 46c77076ab
commit f415e71586
No known key found for this signature in database
GPG Key ID: E735DF276C508071
2 changed files with 12 additions and 31 deletions

View File

@ -6,6 +6,9 @@ log = logging.getLogger(f"loconotion.{__name__}")
class notion_page_loaded(object):
"""An expectation for checking that a notion page has loaded."""
def __init__(self):
self.previous_page_source = ""
def __call__(self, driver):
notion_presence = len(
driver.find_elements_by_class_name("notion-presence-container")
@ -19,19 +22,21 @@ class notion_page_loaded(object):
children = len(scroller.find_elements_by_tag_name("div"))
if children > 0:
scrollers_with_children.append(scroller)
source_changed = self.previous_page_source != driver.page_source
log.debug(
f"Waiting for page content to load"
f" (pending blocks: {unknown_blocks},"
f" loading spinners: {loading_spinners},"
f" loaded scrollers: {len(scrollers_with_children)} / {len(scrollers)})"
f" loaded scrollers: {len(scrollers_with_children)} / {len(scrollers)},"
f" source changed: {source_changed})"
)
all_scrollers_loaded = len(scrollers) == len(scrollers_with_children)
if (all_scrollers_loaded and not unknown_blocks and not loading_spinners):
if (all_scrollers_loaded and not unknown_blocks and not loading_spinners and not source_changed):
return True
else:
return False
else:
return False
self.previous_page_source = driver.page_source
return False
class toggle_block_has_opened(object):

View File

@ -231,7 +231,7 @@ class Parser:
chrome_options = Options()
if not self.args.get("non_headless", False):
chrome_options.add_argument("--headless")
chrome_options.add_argument("window-size=1920,1080")
chrome_options.add_argument("window-size=1920,20000")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--log-level=3")
@ -266,8 +266,6 @@ class Parser:
)
return
self.scroll_to_the_bottom()
# open the toggle blocks in the page
self.open_toggle_blocks(self.args["timeout"])
@ -314,28 +312,6 @@ class Parser:
"__console.environment.ThemeStore.setState({ mode: 'dark' });"
)
def scroll_to_the_bottom(self):
# scroll at the bottom of the notion-scroller element to load all elements
# continue once there are no changes in height after a timeout
# don't do this if the page has a calendar databse on it or it will load forever
calendar = self.driver.find_elements_by_class_name("notion-calendar-view")
if not calendar:
scroller = self.driver.find_element_by_css_selector(
".notion-frame > .notion-scroller"
)
last_height = scroller.get_attribute("scrollHeight")
log.debug(f"Scrolling to bottom of notion-scroller (height: {last_height})")
while True:
self.driver.execute_script(
"arguments[0].scrollTo(0, arguments[0].scrollHeight)", scroller
)
time.sleep(self.args["timeout"])
new_height = scroller.get_attribute("scrollHeight")
log.debug(f"New notion-scroller height after timeout is: {new_height}")
if new_height == last_height:
break
last_height = new_height
def open_toggle_blocks(self, timeout: int, exclude=[]):
"""Expand all the toggle block in the page to make their content visible