diff --git a/main.py b/main.py index 3e7c336..fa5bb9a 100755 --- a/main.py +++ b/main.py @@ -348,6 +348,14 @@ class ArticleWithoutImage: return f"/?enwp={quote(self.title)}" +@dataclasses.dataclass +class CategoryResult: + """Result of a paginated category search.""" + + articles: list[ArticleWithoutImage] + gcmcontinue: str | None + + # Common non-content images to ignore when checking if an article has images NON_CONTENT_IMAGE_PATTERNS = [ "OOjs UI icon", @@ -378,12 +386,16 @@ def has_content_image(images: list[dict]) -> bool: return False -def get_articles_without_images(category: str) -> list[ArticleWithoutImage]: +def get_articles_without_images( + category: str, + limit: int = 200, + gcmcontinue: str | None = None, +) -> CategoryResult: """Get articles in a category that don't have images. Uses generator=categorymembers with prop=images to efficiently check multiple articles in a single API request, following continuation until - all category members have been processed. + the limit is reached or all category members have been processed. """ params = { "action": "query", @@ -391,20 +403,25 @@ def get_articles_without_images(category: str) -> list[ArticleWithoutImage]: "gcmtitle": category, "gcmtype": "page", # Only articles, not subcategories or files "gcmnamespace": "0", # Main namespace only - "gcmlimit": "max", + "gcmlimit": "50", # Small batches so images fit in one response "prop": "images", - "imlimit": "max", # Need enough to check all pages in batch + "imlimit": "max", "format": "json", } headers = {"User-Agent": WIKIMEDIA_USER_AGENT} articles_without_images: list[ArticleWithoutImage] = [] - continue_token: str | None = None + seen_pageids: set[int] = set() + next_gcmcontinue: str | None = None + + # Build initial continue params from the external pagination token + continue_params: dict[str, str] = {} + if gcmcontinue: + continue_params = {"gcmcontinue": gcmcontinue, "continue": "gcmcontinue||"} while True: request_params = params.copy() - if continue_token: - request_params["gcmcontinue"] = continue_token + request_params.update(continue_params) try: response = requests.get( @@ -418,6 +435,11 @@ def get_articles_without_images(category: str) -> list[ArticleWithoutImage]: pages = data.get("query", {}).get("pages", {}) for page in pages.values(): + pageid = page.get("pageid", 0) + if not pageid or pageid in seen_pageids: + continue + seen_pageids.add(pageid) + images = page.get("images", []) # Skip if page has content images (not just UI icons) @@ -425,20 +447,29 @@ def get_articles_without_images(category: str) -> list[ArticleWithoutImage]: continue title = page.get("title", "") - pageid = page.get("pageid", 0) - - if title and pageid: + if title: articles_without_images.append( ArticleWithoutImage(title=title, pageid=pageid) ) - continue_token = data.get("continue", {}).get("gcmcontinue") - if not continue_token: + api_continue = data.get("continue") + if not api_continue: break + # Only stop at generator boundaries where we have a resumable token + gcmc = api_continue.get("gcmcontinue") + if gcmc and len(articles_without_images) >= limit: + next_gcmcontinue = gcmc + break + + continue_params = api_continue + # Sort by title for consistent display articles_without_images.sort(key=lambda a: a.title) - return articles_without_images + return CategoryResult( + articles=articles_without_images, + gcmcontinue=next_gcmcontinue, + ) def is_valid_flickr_image_url(url: str) -> bool: @@ -807,7 +838,8 @@ def category_search() -> str: ) log_interaction("search_category", query=category) - articles = get_articles_without_images(category) + gcmcontinue = flask.request.args.get("gcmcontinue") or None + result = get_articles_without_images(category, gcmcontinue=gcmcontinue) # Get the display name (without Category: prefix) category_name = category.replace("Category:", "") @@ -817,7 +849,8 @@ def category_search() -> str: cat=cat, category=category, category_name=category_name, - articles=articles, + articles=result.articles, + gcmcontinue=result.gcmcontinue, ) diff --git a/templates/category.html b/templates/category.html index 31319b6..62fcb1a 100644 --- a/templates/category.html +++ b/templates/category.html @@ -44,6 +44,12 @@ {% endfor %} + {% if gcmcontinue %} +