#!/usr/bin/python3
"""Collect link statistics for article titles taken from a contributions log.

For each of the most frequently linked titles found in the contributions
file, ask the search API how many pages mention the title and how many of
those pages link to it, then append the figures to the ``examples`` file as
one JSON object per line.
"""

import collections
import json
import re
import sys
import time
import typing

from add_links import api

# Matches titles with a trailing parenthesised qualifier: "Name (qualifier)".
re_disambig = re.compile(r"^(.*) \((.*)\)$")

# Results file: read at start-up to find titles already processed, then
# appended to as new titles are measured.
EXAMPLES_PATH = "examples"


def article_title_to_search_query(title: str) -> str:
    """Convert from article title to search query string.

    A title of the form ``Name (qualifier)`` becomes ``"Name" AND "qualifier"``;
    any other title is simply wrapped in double quotes.
    """
    m = re_disambig.match(title)
    return f'"{m.group(1)}" AND "{m.group(2)}"' if m else f'"{title}"'


def run_search(q: str, limit: int | str = "max") -> dict[str, typing.Any]:
    """Run a text search and return the ``query`` part of the API response.

    :param q: search string, passed as ``srsearch``.
    :param limit: value for ``srlimit``; an integer or the string ``"max"``.
    """
    params = {"list": "search", "srwhat": "text", "srlimit": limit, "srsearch": q}
    return typing.cast(dict[str, typing.Any], api.api_get(params)["query"])


def search_no_link(q: str) -> tuple[int, list[dict[str, str | int]]]:
    """Search for mentions of article title with no link included.

    :returns: ``(totalhits, results)`` where ``results`` is the list of search
        hits returned by the API for this request.
    """
    query = run_search(article_title_to_search_query(q) + f' -linksto:"{q}"', "max")
    totalhits = query["searchinfo"]["totalhits"]
    results = query["search"]
    return (totalhits, results)


def search_count(q: str) -> int:
    """How often does this article title appear in Wikipedia.

    Returns the reported total number of hits minus one.
    """
    query = run_search(article_title_to_search_query(q), limit=0)
    # NOTE(review): the -1 presumably discounts the article's own page,
    # which also matches the query - confirm.
    return typing.cast(int, query["searchinfo"]["totalhits"]) - 1


def search_count_with_link(q: str) -> int:
    """How many pages mention this article title and also link to it."""
    query = run_search(article_title_to_search_query(q) + f' linksto:"{q}"', limit=0)
    return typing.cast(int, query["searchinfo"]["totalhits"])


def parse_contribs() -> list[tuple[str, int]]:
    """Parse user contributions.

    Reads the contributions file line by line (one JSON object per line),
    keeps entries whose comment has the form
    ``link [[Title]] using [[User:Edward/Find link|Find link]]`` and counts
    how often each title occurs. Piped links (``[[Title|label]]``) are
    ignored.

    :returns: the 200 most common ``(title, count)`` pairs, most common first.
    """
    re_comment = re.compile(r"^link \[\[(.*)\]\] using")

    links: collections.Counter[str] = collections.Counter()

    # Use a context manager so the file handle is closed deterministically.
    with open("../wikipedia-contribs/contribs", encoding="utf-8") as contribs:
        for line in contribs:
            # Cheap substring test first, so only candidate lines are
            # parsed as JSON.
            if (
                '"comment": "link ' not in line
                or "using [[User:Edward/Find link|Find link]]" not in line
            ):
                continue
            comment = json.loads(line)["comment"]

            m = re_comment.match(comment)
            if not m:
                continue
            link = m.group(1)

            if "|" not in link:
                links[link] += 1

    return links.most_common(200)


def main() -> None:
    """Measure link ratios for titles not yet recorded in the examples file.

    Titles already present in the examples file are skipped, so the script
    can be interrupted and re-run without repeating API requests.
    """
    seen: set[str]
    try:
        with open(EXAMPLES_PATH, encoding="utf-8") as f:
            seen = {json.loads(line)["title"] for line in f}
    except FileNotFoundError:
        # First run: nothing recorded yet. The append-mode open below
        # creates the file.
        seen = set()

    with open(EXAMPLES_PATH, "a", encoding="utf-8") as out:
        for from_title, _num in parse_contribs():
            if from_title in seen:
                continue

            count = search_count(from_title)
            count_with_link = search_count_with_link(from_title)
            # search_count() subtracts one from the hit total, so it can be
            # zero (or negative); avoid dividing by it in that case.
            ratio = count_with_link / count if count > 0 else 0.0

            print(from_title, count, count_with_link, f"{ratio:.1%}")
            print(
                json.dumps(
                    {"title": from_title, "total": count, "with_links": count_with_link}
                ),
                file=out,
            )
            # Flush after every record so progress survives an interruption.
            out.flush()
            # Short pause between titles to avoid hammering the API.
            time.sleep(0.1)


if __name__ == "__main__":
    main()