#!/usr/bin/python3
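"""Measure how often past Find link targets appear in Wikipedia with and without links.

For each frequently linked title taken from past contributions, record the total
number of pages that mention it and the number that already link to it, appending
the counts to the examples file.
"""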

import collections
import json
import re
import sys
import time
import typing

from add_links import api

# from_title = sys.argv[1]

# Match disambiguated titles like "Mercury (planet)".
re_disambig = re.compile(r"^(.*) \((.*)\)$")


def article_title_to_search_query(title: str) -> str:
    """Convert from article title to search query string."""
    # A title like "Mercury (planet)" becomes '"Mercury" AND "planet"'.
    m = re_disambig.match(title)
    return f'"{m.group(1)}" AND "{m.group(2)}"' if m else f'"{title}"'


def run_search(q: str, limit: int | str = "max") -> dict[str, typing.Any]:
    """Search Wikipedia."""
    params = {"list": "search", "srwhat": "text", "srlimit": limit, "srsearch": q}
    return typing.cast(dict[str, typing.Any], api.api_get(params)["query"])


def search_no_link(q: str) -> tuple[int, list[dict[str, str | int]]]:
    """Search for mentions of article title with no link included."""
    query = run_search(article_title_to_search_query(q) + f' -linksto:"{q}"', "max")
    totalhits = query["searchinfo"]["totalhits"]
    results = query["search"]
    return (totalhits, results)


def search_count(q: str) -> int:
    """How often does this article title appear in Wikipedia."""
    query = run_search(article_title_to_search_query(q), limit=0)
    # Subtract one, presumably to exclude the article itself from the count.
    return typing.cast(int, query["searchinfo"]["totalhits"]) - 1


def search_count_with_link(q: str) -> int:
    """How often does this article title appear in Wikipedia with a link to it."""
    query = run_search(article_title_to_search_query(q) + f' linksto:"{q}"', limit=0)
    return typing.cast(int, query["searchinfo"]["totalhits"])


def parse_contribs() -> list[tuple[str, int]]:
    """Return the 200 most common link targets found in past Find link edit summaries."""
    re_comment = re.compile(r"^link \[\[(.*)\]\] using")

    links: collections.Counter[str] = collections.Counter()

    for line in open("../wikipedia-contribs/contribs"):
        if (
            '"comment": "link ' not in line
            or "using [[User:Edward/Find link|Find link]]" not in line
        ):
            continue
        comment = json.loads(line)["comment"]

        m = re_comment.match(comment)
        if not m:
            continue
        link = m.group(1)

        # Only count plain [[Title]] links, not piped links.
        if "|" not in link:
            links[link] += 1

    return links.most_common(200)


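# Titles already recorded in the examples file; these are skipped below.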
with open("examples") as f:
    seen = {json.loads(line)["title"] for line in f}


out = open("examples", "a")
for from_title, num in parse_contribs():
    if from_title in seen:
        continue
    count = search_count(from_title)
    count_with_link = search_count_with_link(from_title)
    # Guard against division by zero when search_count() returns 0.
    ratio = count_with_link / count if count else 0.0

    print(from_title, count, count_with_link, f"{ratio:.1%}")
    print(
        json.dumps(
            {"title": from_title, "total": count, "with_links": count_with_link}
        ),
        file=out,
    )
    out.flush()
    time.sleep(0.1)
out.close()

sys.exit(0)
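

# Unreachable: the script exits above. The snippets below appear to be
# leftovers from one-off checks of a single title.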

count = search_count(from_title)
count_with_link = search_count_with_link(from_title)
ratio = float(count_with_link) / float(count)

print(count, count_with_link, f"{ratio:.1%}")

sys.exit(0)

totalhits, search_hits = search_no_link(from_title)

for hit in search_hits:
    print(" ", hit)
print(count, count_with_link, f"{ratio:.1%}", totalhits, len(search_hits))

# ret = core.do_search(from_title)
# print(ret)