Make Commons contributions sync resilient to shallow gaps

This commit is contained in:
Edward Betts 2026-02-07 13:34:09 +00:00
parent e072279566
commit 4f67960fe1

View file

@ -12,6 +12,7 @@ from flickr_mail.models import Contribution
API_URL = "https://commons.wikimedia.org/w/api.php" API_URL = "https://commons.wikimedia.org/w/api.php"
USERNAME = "Edward" USERNAME = "Edward"
CONSECUTIVE_KNOWN_BATCHES_TO_STOP = 3
# Identify ourselves properly to Wikimedia # Identify ourselves properly to Wikimedia
USER_AGENT = "CommonsContributionsDownloader/0.1 (edward@4angle.com)" USER_AGENT = "CommonsContributionsDownloader/0.1 (edward@4angle.com)"
@ -93,6 +94,7 @@ def main() -> None:
batch_num = 0 batch_num = 0
new_count = 0 new_count = 0
continue_token = None continue_token = None
consecutive_known_batches = 0
while True: while True:
batch_num += 1 batch_num += 1
@ -130,7 +132,18 @@ def main() -> None:
session.commit() session.commit()
if batch_new == 0: if batch_new == 0:
# All contributions in this batch already exist, we're caught up consecutive_known_batches += 1
print(
" Batch fully known "
f"({consecutive_known_batches}/"
f"{CONSECUTIVE_KNOWN_BATCHES_TO_STOP})"
)
else:
consecutive_known_batches = 0
if consecutive_known_batches >= CONSECUTIVE_KNOWN_BATCHES_TO_STOP:
# Stop after a small overlap window of known-only batches.
# This catches recent historical gaps without full-history scans.
print(" Caught up with existing data") print(" Caught up with existing data")
break break