Improve backup flow to wait for export readiness instead of fixed delay

Replace the old fixed 10-minute wait with readiness-based waits on the export link state, so exports start downloading as soon as Goodreads marks them ready.

Also harden login selectors/timeouts, add fallback CSV fetching when browser download events do not fire, and add the mutually exclusive --check-login and --login-only CLI modes along with the --no-headless and --har-path options.

Closes #3
This commit is contained in:
Edward Betts 2026-02-17 13:07:32 +00:00
parent 497e04a26f
commit ae932b26b9

153
backup.py Normal file → Executable file
View file

@ -1,13 +1,17 @@
#!/usr/bin/python3
"""Backup list of books from Goodreads.""" """Backup list of books from Goodreads."""
import argparse
import configparser import configparser
import os import os
import sys
import time
from datetime import date from datetime import date
from urllib.parse import urljoin
from playwright.sync_api import Page, Playwright, TimeoutError, sync_playwright
from playwright.sync_api import Page, Playwright, sync_playwright
import_url = "https://www.goodreads.com/review/import"
wait_mins = 10
refresh_backup = True refresh_backup = True
today = date.today().isoformat() # current date in ISO format today = date.today().isoformat() # current date in ISO format
@ -24,59 +28,119 @@ script_dir = os.path.dirname(os.path.abspath(__file__))
def login(page: Page) -> None: def login(page: Page) -> None:
"""Login to Goodreads.""" """Login to Goodreads."""
page.goto("https://www.goodreads.com/")
page.get_by_role("link", name="Sign In").click() page.get_by_role("link", name="Sign In").click()
page.get_by_role("button", name="Sign in with email").click() page.get_by_role("button", name="Sign in with email").click()
page.get_by_label("Email").fill(config.get("login", "email")) page.get_by_label("Email").fill(config.get("login", "email"))
page.get_by_label("Password").fill(config.get("login", "password")) page.get_by_label("Password").fill(config.get("login", "password"))
page.get_by_label("Keep me signed in").click() page.get_by_role("checkbox", name="Keep me signed in").check()
try:
page.get_by_label("Sign in").click() page.get_by_label("Sign in").click()
except TimeoutError:
# The submit can succeed but never reach Playwright's expected navigation state.
def navigate_to_import_and_export(page: Page) -> None: pass
"""Navigate to import and export."""
page.goto("https://www.goodreads.com/")
page.get_by_role("link", name="My Books").click()
page.get_by_role("link", name="Import and export").click()
def run_backup(page: Page) -> None: def run_backup(page: Page) -> None:
"""Run backup.""" """Run backup."""
import_url = "https://www.goodreads.com/review/import"
page.goto(import_url) page.goto(import_url)
if refresh_backup: if refresh_backup:
print("backup requested") print("backup requested")
page.get_by_role("button", name="Export Library").click() export_button = page.get_by_role("button", name="Export Library")
print(f"waiting for {wait_mins} minutes") if export_button.count() > 0 and export_button.first.is_visible():
export_button.first.click()
print("waiting for export to be ready...")
page.wait_for_timeout(wait_mins * 60 * 1000) # JS clears #exportFile when generation starts, then populates it when ready.
export_file_link = page.locator("#exportFile a")
print("reloading page") try:
page.reload() export_file_link.wait_for(state="hidden", timeout=30 * 1000)
except TimeoutError:
pass
export_file_link.wait_for(state="visible", timeout=15 * 60 * 1000)
print("export ready")
else:
print("Export Library button not found, using latest ready export link")
print("download export") print("download export")
export_link = page.get_by_role("link", name="Your export from") export_link = page.get_by_role("link", name="Your export from").first
export_link.wait_for(state="visible", timeout=2 * 60 * 1000)
print(export_link.text_content()) print(export_link.text_content())
with page.expect_download() as download_info: href = export_link.get_attribute("href")
page.get_by_role("link", name="Your export from").click() download_url = urljoin(page.url, href) if href else None
download = download_info.value
backup_dir = config.get("backup", "dir") backup_dir = config.get("backup", "dir")
save_to = os.path.join(backup_dir, f"{today}_goodreads_library_export.csv") save_to = os.path.join(backup_dir, f"{today}_goodreads_library_export.csv")
try:
with page.expect_download(timeout=30 * 1000) as download_info:
export_link.click()
download = download_info.value
download.save_as(save_to) download.save_as(save_to)
except TimeoutError:
if not download_url:
raise RuntimeError("Could not find export download URL")
response = None
for _ in range(60):
response = page.context.request.get(download_url)
if response.ok:
with open(save_to, "wb") as fh:
fh.write(response.body())
break
time.sleep(2)
else:
assert response is not None
raise RuntimeError(f"Export download failed with status {response.status}")
def run(playwright: Playwright) -> None: def has_auth(path: str) -> bool:
"""Check if auth.json exists and is non-empty."""
return os.path.exists(path) and os.path.getsize(path) > 0
def run(
playwright: Playwright,
headless: bool = True,
har_path: str | None = None,
mode: str = "backup",
) -> int:
"""Download export.""" """Download export."""
browser = playwright.chromium.launch(headless=True) browser = playwright.chromium.launch(headless=headless)
auth_json = os.path.join(script_dir, "auth.json") auth_json = os.path.join(script_dir, "auth.json")
if has_auth(auth_json) and har_path:
context = browser.new_context(
storage_state=auth_json,
record_har_path=har_path,
record_har_mode="minimal",
)
elif has_auth(auth_json):
context = browser.new_context(storage_state=auth_json) context = browser.new_context(storage_state=auth_json)
elif har_path:
context = browser.new_context(
record_har_path=har_path,
record_har_mode="minimal",
)
else:
context = browser.new_context()
page = context.new_page() page = context.new_page()
# login(page)
page.goto("https://www.goodreads.com/") page.goto("https://www.goodreads.com/")
logged_in = not page.get_by_role("link", name="Sign In").is_visible()
if mode == "check_login":
print("logged in" if logged_in else "not logged in")
page.close()
context.close()
browser.close()
return 0 if logged_in else 1
if not logged_in:
print("not logged in, logging in...")
login(page)
if mode == "login_only":
print("login complete")
else:
run_backup(page) run_backup(page)
page.close() page.close()
@ -84,7 +148,42 @@ def run(playwright: Playwright) -> None:
context.storage_state(path=auth_json) context.storage_state(path=auth_json)
context.close() context.close()
browser.close() browser.close()
return 0
parser = argparse.ArgumentParser(description="Backup list of books from Goodreads.")
parser.add_argument(
"--no-headless", action="store_true", help="run browser with a visible window"
)
parser.add_argument(
"--har-path",
help="record a HAR file for this run (includes login/export requests)",
)
mode_group = parser.add_mutually_exclusive_group()
mode_group.add_argument(
"--check-login",
action="store_true",
help="check whether current session is logged in and exit",
)
mode_group.add_argument(
"--login-only",
action="store_true",
help="perform login and save auth state without exporting CSV",
)
args = parser.parse_args()
mode = "backup"
if args.check_login:
mode = "check_login"
elif args.login_only:
mode = "login_only"
with sync_playwright() as playwright: with sync_playwright() as playwright:
run(playwright) sys.exit(
run(
playwright,
headless=not args.no_headless,
har_path=args.har_path,
mode=mode,
)
)