Merge pull request #99 from 2m/fix/content-load-2m

More reliable content loading
This commit is contained in:
Leonardo Cavaletti 2022-04-03 17:56:23 +01:00 committed by GitHub
commit 093df46b2e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 12 additions and 31 deletions

View File

@ -6,6 +6,9 @@ log = logging.getLogger(f"loconotion.{__name__}")
class notion_page_loaded(object):
"""An expectation for checking that a notion page has loaded."""
def __init__(self):
self.previous_page_source = ""
def __call__(self, driver):
notion_presence = len(
driver.find_elements_by_class_name("notion-presence-container")
@ -19,18 +22,20 @@ class notion_page_loaded(object):
children = len(scroller.find_elements_by_tag_name("div"))
if children > 0:
scrollers_with_children.append(scroller)
source_changed = self.previous_page_source != driver.page_source
log.debug(
f"Waiting for page content to load"
f" (pending blocks: {unknown_blocks},"
f" loading spinners: {loading_spinners},"
f" loaded scrollers: {len(scrollers_with_children)} / {len(scrollers)})"
f" loaded scrollers: {len(scrollers_with_children)} / {len(scrollers)},"
f" source changed: {source_changed})"
)
all_scrollers_loaded = len(scrollers) == len(scrollers_with_children)
if (all_scrollers_loaded and not unknown_blocks and not loading_spinners):
if (all_scrollers_loaded and not unknown_blocks and not loading_spinners and not source_changed):
return True
else:
return False
else:
self.previous_page_source = driver.page_source
return False

View File

@ -232,7 +232,7 @@ class Parser:
chrome_options = Options()
if not self.args.get("non_headless", False):
chrome_options.add_argument("--headless")
chrome_options.add_argument("window-size=1920,1080")
chrome_options.add_argument("window-size=1920,20000")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--log-level=3")
@ -267,8 +267,6 @@ class Parser:
)
raise ex
self.scroll_to_the_bottom()
# open the toggle blocks in the page
self.open_toggle_blocks(self.args["timeout"])
@ -315,28 +313,6 @@ class Parser:
"__console.environment.ThemeStore.setState({ mode: 'dark' });"
)
def scroll_to_the_bottom(self):
    """Scroll the main notion-scroller down until its height stops growing.

    The ``.notion-frame > .notion-scroller`` element is repeatedly scrolled
    to its current ``scrollHeight``; after each scroll the method sleeps for
    ``self.args["timeout"]`` and reads the height again. The loop ends as
    soon as two consecutive reads return the same value.

    Pages containing a ``notion-calendar-view`` element are skipped
    entirely: with a calendar database on the page the content would keep
    loading forever.
    """
    # Guard clause: a non-empty result means a calendar view is present.
    if self.driver.find_elements_by_class_name("notion-calendar-view"):
        return

    scroller = self.driver.find_element_by_css_selector(
        ".notion-frame > .notion-scroller"
    )
    previous_height = scroller.get_attribute("scrollHeight")
    log.debug(f"Scrolling to bottom of notion-scroller (height: {previous_height})")

    reached_bottom = False
    while not reached_bottom:
        self.driver.execute_script(
            "arguments[0].scrollTo(0, arguments[0].scrollHeight)", scroller
        )
        # Give the page time to append lazily loaded content.
        time.sleep(self.args["timeout"])
        current_height = scroller.get_attribute("scrollHeight")
        log.debug(f"New notion-scroller height after timeout is: {current_height}")
        # An unchanged height means nothing new was loaded by the last scroll.
        reached_bottom = current_height == previous_height
        previous_height = current_height
def open_toggle_blocks(self, timeout: int, exclude=[]):
"""Expand all the toggle block in the page to make their content visible