diff --git a/download_commons_contributions.py b/download_commons_contributions.py index 1753552..a2dea6e 100755 --- a/download_commons_contributions.py +++ b/download_commons_contributions.py @@ -12,6 +12,7 @@ from flickr_mail.models import Contribution API_URL = "https://commons.wikimedia.org/w/api.php" USERNAME = "Edward" +CONSECUTIVE_KNOWN_BATCHES_TO_STOP = 3 # Identify ourselves properly to Wikimedia USER_AGENT = "CommonsContributionsDownloader/0.1 (edward@4angle.com)" @@ -93,6 +94,7 @@ def main() -> None: batch_num = 0 new_count = 0 continue_token = None + consecutive_known_batches = 0 while True: batch_num += 1 @@ -130,7 +132,18 @@ def main() -> None: session.commit() if batch_new == 0: - # All contributions in this batch already exist, we're caught up + consecutive_known_batches += 1 + print( + " Batch fully known " + f"({consecutive_known_batches}/" + f"{CONSECUTIVE_KNOWN_BATCHES_TO_STOP})" + ) + else: + consecutive_known_batches = 0 + + if consecutive_known_batches >= CONSECUTIVE_KNOWN_BATCHES_TO_STOP: + # Stop after a small overlap window of known-only batches. + # This catches recent historical gaps without full-history scans. print(" Caught up with existing data") break