Improve backup flow to wait for export readiness instead of a fixed delay
Replace the old fixed 10-minute wait with readiness-based waits on the export link state, so exports start downloading as soon as Goodreads marks them ready. Also harden login selectors/timeouts, add fallback CSV fetching when browser download events do not fire, and add CLI modes --check-login and --login-only plus the --no-headless and --har-path flags. Closes #3
This commit is contained in:
parent
497e04a26f
commit
ae932b26b9
1 changed file with 130 additions and 31 deletions
153
backup.py
Normal file → Executable file
153
backup.py
Normal file → Executable file
|
|
@ -1,13 +1,17 @@
|
||||||
|
#!/usr/bin/python3
|
||||||
"""Backup list of books from Goodreads."""
|
"""Backup list of books from Goodreads."""
|
||||||
|
|
||||||
|
import argparse
|
||||||
import configparser
|
import configparser
|
||||||
import os
|
import os
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
from datetime import date
|
from datetime import date
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
|
from playwright.sync_api import Page, Playwright, TimeoutError, sync_playwright
|
||||||
|
|
||||||
from playwright.sync_api import Page, Playwright, sync_playwright
|
|
||||||
|
|
||||||
import_url = "https://www.goodreads.com/review/import"
|
|
||||||
wait_mins = 10
|
|
||||||
refresh_backup = True
|
refresh_backup = True
|
||||||
|
|
||||||
today = date.today().isoformat() # current date in ISO format
|
today = date.today().isoformat() # current date in ISO format
|
||||||
|
|
@ -24,59 +28,119 @@ script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
|
||||||
def login(page: Page) -> None:
|
def login(page: Page) -> None:
|
||||||
"""Login to Goodreads."""
|
"""Login to Goodreads."""
|
||||||
page.goto("https://www.goodreads.com/")
|
|
||||||
page.get_by_role("link", name="Sign In").click()
|
page.get_by_role("link", name="Sign In").click()
|
||||||
page.get_by_role("button", name="Sign in with email").click()
|
page.get_by_role("button", name="Sign in with email").click()
|
||||||
page.get_by_label("Email").fill(config.get("login", "email"))
|
page.get_by_label("Email").fill(config.get("login", "email"))
|
||||||
page.get_by_label("Password").fill(config.get("login", "password"))
|
page.get_by_label("Password").fill(config.get("login", "password"))
|
||||||
page.get_by_label("Keep me signed in").click()
|
page.get_by_role("checkbox", name="Keep me signed in").check()
|
||||||
|
try:
|
||||||
page.get_by_label("Sign in").click()
|
page.get_by_label("Sign in").click()
|
||||||
|
except TimeoutError:
|
||||||
|
# The submit can succeed but never reach Playwright's expected navigation state.
|
||||||
def navigate_to_import_and_export(page: Page) -> None:
|
pass
|
||||||
"""Navigate to import and export."""
|
|
||||||
page.goto("https://www.goodreads.com/")
|
|
||||||
page.get_by_role("link", name="My Books").click()
|
|
||||||
page.get_by_role("link", name="Import and export").click()
|
|
||||||
|
|
||||||
|
|
||||||
def run_backup(page: Page) -> None:
|
def run_backup(page: Page) -> None:
|
||||||
"""Run backup."""
|
"""Run backup."""
|
||||||
|
import_url = "https://www.goodreads.com/review/import"
|
||||||
page.goto(import_url)
|
page.goto(import_url)
|
||||||
|
|
||||||
if refresh_backup:
|
if refresh_backup:
|
||||||
print("backup requested")
|
print("backup requested")
|
||||||
page.get_by_role("button", name="Export Library").click()
|
export_button = page.get_by_role("button", name="Export Library")
|
||||||
print(f"waiting for {wait_mins} minutes")
|
if export_button.count() > 0 and export_button.first.is_visible():
|
||||||
|
export_button.first.click()
|
||||||
|
print("waiting for export to be ready...")
|
||||||
|
|
||||||
page.wait_for_timeout(wait_mins * 60 * 1000)
|
# JS clears #exportFile when generation starts, then populates it when ready.
|
||||||
|
export_file_link = page.locator("#exportFile a")
|
||||||
print("reloading page")
|
try:
|
||||||
page.reload()
|
export_file_link.wait_for(state="hidden", timeout=30 * 1000)
|
||||||
|
except TimeoutError:
|
||||||
|
pass
|
||||||
|
export_file_link.wait_for(state="visible", timeout=15 * 60 * 1000)
|
||||||
|
print("export ready")
|
||||||
|
else:
|
||||||
|
print("Export Library button not found, using latest ready export link")
|
||||||
|
|
||||||
print("download export")
|
print("download export")
|
||||||
export_link = page.get_by_role("link", name="Your export from")
|
export_link = page.get_by_role("link", name="Your export from").first
|
||||||
|
export_link.wait_for(state="visible", timeout=2 * 60 * 1000)
|
||||||
print(export_link.text_content())
|
print(export_link.text_content())
|
||||||
with page.expect_download() as download_info:
|
href = export_link.get_attribute("href")
|
||||||
page.get_by_role("link", name="Your export from").click()
|
download_url = urljoin(page.url, href) if href else None
|
||||||
download = download_info.value
|
|
||||||
backup_dir = config.get("backup", "dir")
|
backup_dir = config.get("backup", "dir")
|
||||||
|
|
||||||
save_to = os.path.join(backup_dir, f"{today}_goodreads_library_export.csv")
|
save_to = os.path.join(backup_dir, f"{today}_goodreads_library_export.csv")
|
||||||
|
try:
|
||||||
|
with page.expect_download(timeout=30 * 1000) as download_info:
|
||||||
|
export_link.click()
|
||||||
|
download = download_info.value
|
||||||
download.save_as(save_to)
|
download.save_as(save_to)
|
||||||
|
except TimeoutError:
|
||||||
|
if not download_url:
|
||||||
|
raise RuntimeError("Could not find export download URL")
|
||||||
|
response = None
|
||||||
|
for _ in range(60):
|
||||||
|
response = page.context.request.get(download_url)
|
||||||
|
if response.ok:
|
||||||
|
with open(save_to, "wb") as fh:
|
||||||
|
fh.write(response.body())
|
||||||
|
break
|
||||||
|
time.sleep(2)
|
||||||
|
else:
|
||||||
|
assert response is not None
|
||||||
|
raise RuntimeError(f"Export download failed with status {response.status}")
|
||||||
|
|
||||||
|
|
||||||
def run(playwright: Playwright) -> None:
|
def has_auth(path: str) -> bool:
|
||||||
|
"""Check if auth.json exists and is non-empty."""
|
||||||
|
return os.path.exists(path) and os.path.getsize(path) > 0
|
||||||
|
|
||||||
|
|
||||||
|
def run(
|
||||||
|
playwright: Playwright,
|
||||||
|
headless: bool = True,
|
||||||
|
har_path: str | None = None,
|
||||||
|
mode: str = "backup",
|
||||||
|
) -> int:
|
||||||
"""Download export."""
|
"""Download export."""
|
||||||
browser = playwright.chromium.launch(headless=True)
|
browser = playwright.chromium.launch(headless=headless)
|
||||||
auth_json = os.path.join(script_dir, "auth.json")
|
auth_json = os.path.join(script_dir, "auth.json")
|
||||||
|
if has_auth(auth_json) and har_path:
|
||||||
|
context = browser.new_context(
|
||||||
|
storage_state=auth_json,
|
||||||
|
record_har_path=har_path,
|
||||||
|
record_har_mode="minimal",
|
||||||
|
)
|
||||||
|
elif has_auth(auth_json):
|
||||||
context = browser.new_context(storage_state=auth_json)
|
context = browser.new_context(storage_state=auth_json)
|
||||||
|
elif har_path:
|
||||||
|
context = browser.new_context(
|
||||||
|
record_har_path=har_path,
|
||||||
|
record_har_mode="minimal",
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
context = browser.new_context()
|
||||||
page = context.new_page()
|
page = context.new_page()
|
||||||
|
|
||||||
# login(page)
|
|
||||||
|
|
||||||
page.goto("https://www.goodreads.com/")
|
page.goto("https://www.goodreads.com/")
|
||||||
|
logged_in = not page.get_by_role("link", name="Sign In").is_visible()
|
||||||
|
|
||||||
|
if mode == "check_login":
|
||||||
|
print("logged in" if logged_in else "not logged in")
|
||||||
|
page.close()
|
||||||
|
context.close()
|
||||||
|
browser.close()
|
||||||
|
return 0 if logged_in else 1
|
||||||
|
|
||||||
|
if not logged_in:
|
||||||
|
print("not logged in, logging in...")
|
||||||
|
login(page)
|
||||||
|
|
||||||
|
if mode == "login_only":
|
||||||
|
print("login complete")
|
||||||
|
else:
|
||||||
run_backup(page)
|
run_backup(page)
|
||||||
|
|
||||||
page.close()
|
page.close()
|
||||||
|
|
@ -84,7 +148,42 @@ def run(playwright: Playwright) -> None:
|
||||||
context.storage_state(path=auth_json)
|
context.storage_state(path=auth_json)
|
||||||
context.close()
|
context.close()
|
||||||
browser.close()
|
browser.close()
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser(description="Backup list of books from Goodreads.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--no-headless", action="store_true", help="run browser with a visible window"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--har-path",
|
||||||
|
help="record a HAR file for this run (includes login/export requests)",
|
||||||
|
)
|
||||||
|
mode_group = parser.add_mutually_exclusive_group()
|
||||||
|
mode_group.add_argument(
|
||||||
|
"--check-login",
|
||||||
|
action="store_true",
|
||||||
|
help="check whether current session is logged in and exit",
|
||||||
|
)
|
||||||
|
mode_group.add_argument(
|
||||||
|
"--login-only",
|
||||||
|
action="store_true",
|
||||||
|
help="perform login and save auth state without exporting CSV",
|
||||||
|
)
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
mode = "backup"
|
||||||
|
if args.check_login:
|
||||||
|
mode = "check_login"
|
||||||
|
elif args.login_only:
|
||||||
|
mode = "login_only"
|
||||||
|
|
||||||
with sync_playwright() as playwright:
|
with sync_playwright() as playwright:
|
||||||
run(playwright)
|
sys.exit(
|
||||||
|
run(
|
||||||
|
playwright,
|
||||||
|
headless=not args.no_headless,
|
||||||
|
har_path=args.har_path,
|
||||||
|
mode=mode,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue