From ae932b26b94d84f5832a1a4e4a66ea3c8b6e4623 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Tue, 17 Feb 2026 13:07:32 +0000 Subject: [PATCH] Improve backup flow to wait for export readiness instead of fixed delay Replace the old fixed 10-minute wait with readiness-based waits on the export link state, so exports start downloading as soon as Goodreads marks them ready. Also harden login selectors/timeouts, add fallback CSV fetching when browser download events do not fire, and add CLI modes for --check-login, --login-only, --no-headless, and --har-path. Closes #3 --- backup.py | 161 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 130 insertions(+), 31 deletions(-) mode change 100644 => 100755 backup.py diff --git a/backup.py b/backup.py old mode 100644 new mode 100755 index c1f7d35..2bb98ba --- a/backup.py +++ b/backup.py @@ -1,13 +1,17 @@ +#!/usr/bin/python3 """Backup list of books from Goodreads.""" +import argparse import configparser import os +import sys +import time from datetime import date +from urllib.parse import urljoin + +from playwright.sync_api import Page, Playwright, TimeoutError, sync_playwright -from playwright.sync_api import Page, Playwright, sync_playwright -import_url = "https://www.goodreads.com/review/import" -wait_mins = 10 refresh_backup = True today = date.today().isoformat() # current date in ISO format @@ -24,67 +28,162 @@ script_dir = os.path.dirname(os.path.abspath(__file__)) def login(page: Page) -> None: """Login to Goodreads.""" - page.goto("https://www.goodreads.com/") page.get_by_role("link", name="Sign In").click() page.get_by_role("button", name="Sign in with email").click() page.get_by_label("Email").fill(config.get("login", "email")) page.get_by_label("Password").fill(config.get("login", "password")) - page.get_by_label("Keep me signed in").click() - page.get_by_label("Sign in").click() - - -def navigate_to_import_and_export(page: Page) -> None: - """Navigate to import and export.""" - page.goto("https://www.goodreads.com/") - page.get_by_role("link", name="My Books").click() - page.get_by_role("link", name="Import and export").click() + page.get_by_role("checkbox", name="Keep me signed in").check() + try: + page.get_by_label("Sign in").click() + except TimeoutError: + # The submit can succeed but never reach Playwright's expected navigation state. + pass def run_backup(page: Page) -> None: """Run backup.""" + import_url = "https://www.goodreads.com/review/import" page.goto(import_url) if refresh_backup: print("backup requested") - page.get_by_role("button", name="Export Library").click() - print(f"waiting for {wait_mins} minutes") + export_button = page.get_by_role("button", name="Export Library") + if export_button.count() > 0 and export_button.first.is_visible(): + export_button.first.click() + print("waiting for export to be ready...") - page.wait_for_timeout(wait_mins * 60 * 1000) - - print("reloading page") - page.reload() + # JS clears #exportFile when generation starts, then populates it when ready. + export_file_link = page.locator("#exportFile a") + try: + export_file_link.wait_for(state="hidden", timeout=30 * 1000) + except TimeoutError: + pass + export_file_link.wait_for(state="visible", timeout=15 * 60 * 1000) + print("export ready") + else: + print("Export Library button not found, using latest ready export link") print("download export") - export_link = page.get_by_role("link", name="Your export from") + export_link = page.get_by_role("link", name="Your export from").first + export_link.wait_for(state="visible", timeout=2 * 60 * 1000) print(export_link.text_content()) - with page.expect_download() as download_info: - page.get_by_role("link", name="Your export from").click() - download = download_info.value + href = export_link.get_attribute("href") + download_url = urljoin(page.url, href) if href else None backup_dir = config.get("backup", "dir") save_to = os.path.join(backup_dir, f"{today}_goodreads_library_export.csv") - download.save_as(save_to) + try: + with page.expect_download(timeout=30 * 1000) as download_info: + export_link.click() + download = download_info.value + download.save_as(save_to) + except TimeoutError: + if not download_url: + raise RuntimeError("Could not find export download URL") + response = None + for _ in range(60): + response = page.context.request.get(download_url) + if response.ok: + with open(save_to, "wb") as fh: + fh.write(response.body()) + break + time.sleep(2) + else: + assert response is not None + raise RuntimeError(f"Export download failed with status {response.status}") -def run(playwright: Playwright) -> None: +def has_auth(path: str) -> bool: + """Check if auth.json exists and is non-empty.""" + return os.path.exists(path) and os.path.getsize(path) > 0 + + +def run( + playwright: Playwright, + headless: bool = True, + har_path: str | None = None, + mode: str = "backup", +) -> int: """Download export.""" - browser = playwright.chromium.launch(headless=True) + browser = playwright.chromium.launch(headless=headless) auth_json = os.path.join(script_dir, "auth.json") - context = browser.new_context(storage_state=auth_json) + if has_auth(auth_json) and har_path: + context = browser.new_context( + storage_state=auth_json, + record_har_path=har_path, + record_har_mode="minimal", + ) + elif has_auth(auth_json): + context = browser.new_context(storage_state=auth_json) + elif har_path: + context = browser.new_context( + record_har_path=har_path, + record_har_mode="minimal", + ) + else: + context = browser.new_context() page = context.new_page() - # login(page) - page.goto("https://www.goodreads.com/") + logged_in = not page.get_by_role("link", name="Sign In").is_visible() - run_backup(page) + if mode == "check_login": + print("logged in" if logged_in else "not logged in") + page.close() + context.close() + browser.close() + return 0 if logged_in else 1 + + if not logged_in: + print("not logged in, logging in...") + login(page) + + if mode == "login_only": + print("login complete") + else: + run_backup(page) page.close() context.storage_state(path=auth_json) context.close() browser.close() + return 0 +parser = argparse.ArgumentParser(description="Backup list of books from Goodreads.") +parser.add_argument( + "--no-headless", action="store_true", help="run browser with a visible window" +) +parser.add_argument( + "--har-path", + help="record a HAR file for this run (includes login/export requests)", +) +mode_group = parser.add_mutually_exclusive_group() +mode_group.add_argument( + "--check-login", + action="store_true", + help="check whether current session is logged in and exit", +) +mode_group.add_argument( + "--login-only", + action="store_true", + help="perform login and save auth state without exporting CSV", +) +args = parser.parse_args() + +mode = "backup" +if args.check_login: + mode = "check_login" +elif args.login_only: + mode = "login_only" + with sync_playwright() as playwright: - run(playwright) + sys.exit( + run( + playwright, + headless=not args.no_headless, + har_path=args.har_path, + mode=mode, + ) + )