Make commons contributions sync resilient to shallow gaps
This commit is contained in:
parent
e072279566
commit
4f67960fe1
1 changed file with 14 additions and 1 deletion
|
|
@@ -12,6 +12,7 @@ from flickr_mail.models import Contribution
|
||||||
|
|
||||||
API_URL = "https://commons.wikimedia.org/w/api.php"
|
API_URL = "https://commons.wikimedia.org/w/api.php"
|
||||||
USERNAME = "Edward"
|
USERNAME = "Edward"
|
||||||
|
CONSECUTIVE_KNOWN_BATCHES_TO_STOP = 3
|
||||||
|
|
||||||
# Identify ourselves properly to Wikimedia
|
# Identify ourselves properly to Wikimedia
|
||||||
USER_AGENT = "CommonsContributionsDownloader/0.1 (edward@4angle.com)"
|
USER_AGENT = "CommonsContributionsDownloader/0.1 (edward@4angle.com)"
|
||||||
|
|
@@ -93,6 +94,7 @@ def main() -> None:
|
||||||
batch_num = 0
|
batch_num = 0
|
||||||
new_count = 0
|
new_count = 0
|
||||||
continue_token = None
|
continue_token = None
|
||||||
|
consecutive_known_batches = 0
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
batch_num += 1
|
batch_num += 1
|
||||||
|
|
@@ -130,7 +132,18 @@ def main() -> None:
|
||||||
session.commit()
|
session.commit()
|
||||||
|
|
||||||
if batch_new == 0:
|
if batch_new == 0:
|
||||||
# All contributions in this batch already exist, we're caught up
|
consecutive_known_batches += 1
|
||||||
|
print(
|
||||||
|
" Batch fully known "
|
||||||
|
f"({consecutive_known_batches}/"
|
||||||
|
f"{CONSECUTIVE_KNOWN_BATCHES_TO_STOP})"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
consecutive_known_batches = 0
|
||||||
|
|
||||||
|
if consecutive_known_batches >= CONSECUTIVE_KNOWN_BATCHES_TO_STOP:
|
||||||
|
# Stop after a small overlap window of known-only batches.
|
||||||
|
# This catches recent historical gaps without full-history scans.
|
||||||
print(" Caught up with existing data")
|
print(" Caught up with existing data")
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue