Make commons contributions sync resilient to shallow gaps
This commit is contained in:
parent
e072279566
commit
4f67960fe1
1 changed file with 14 additions and 1 deletion
|
|
@@ -12,6 +12,7 @@ from flickr_mail.models import Contribution
|
|||
|
||||
API_URL = "https://commons.wikimedia.org/w/api.php"
|
||||
USERNAME = "Edward"
|
||||
CONSECUTIVE_KNOWN_BATCHES_TO_STOP = 3
|
||||
|
||||
# Identify ourselves properly to Wikimedia
|
||||
USER_AGENT = "CommonsContributionsDownloader/0.1 (edward@4angle.com)"
|
||||
|
|
@@ -93,6 +94,7 @@ def main() -> None:
|
|||
batch_num = 0
|
||||
new_count = 0
|
||||
continue_token = None
|
||||
consecutive_known_batches = 0
|
||||
|
||||
while True:
|
||||
batch_num += 1
|
||||
|
|
@@ -130,7 +132,18 @@ def main() -> None:
|
|||
session.commit()
|
||||
|
||||
if batch_new == 0:
|
||||
# All contributions in this batch already exist; count it toward the stop threshold
|
||||
consecutive_known_batches += 1
|
||||
print(
|
||||
" Batch fully known "
|
||||
f"({consecutive_known_batches}/"
|
||||
f"{CONSECUTIVE_KNOWN_BATCHES_TO_STOP})"
|
||||
)
|
||||
else:
|
||||
consecutive_known_batches = 0
|
||||
|
||||
if consecutive_known_batches >= CONSECUTIVE_KNOWN_BATCHES_TO_STOP:
|
||||
# Stop after a small overlap window of known-only batches.
|
||||
# This catches recent historical gaps without full-history scans.
|
||||
print(" Caught up with existing data")
|
||||
break
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue