add-links/cmdline.py

115 lines
3.1 KiB
Python
Raw Normal View History

2023-10-04 12:56:21 +01:00
#!/usr/bin/python3
import collections
import json
import re
import sys
import time
import typing
from add_links import api
# from_title = sys.argv[1]
re_disambig = re.compile(r"^(.*) \((.*)\)$")


def article_title_to_search_query(title: str) -> str:
    """Build a quoted search query from an article title.

    A title of the form ``Name (qualifier)`` becomes two quoted phrases joined
    with ``AND``; any other title becomes a single quoted phrase.
    """
    match = re_disambig.match(title)
    if match is None:
        return f'"{title}"'
    # The pattern is greedy, so only the final parenthesised part is split off.
    name, qualifier = match.groups()
    return f'"{name}" AND "{qualifier}"'
def run_search(q: str, limit: int | str = "max") -> dict[str, typing.Any]:
    """Run a full-text search and return the ``query`` part of the response.

    ``q`` is sent as ``srsearch`` and ``limit`` as ``srlimit`` (``"max"`` by
    default), together with ``list=search`` and ``srwhat=text``. The request
    itself is delegated to ``api.api_get``.
    """
    search_params = {
        "list": "search",
        "srwhat": "text",
        "srlimit": limit,
        "srsearch": q,
    }
    response = api.api_get(search_params)
    return typing.cast(dict[str, typing.Any], response["query"])
def search_no_link(q: str) -> tuple[int, list[dict[str, str | int]]]:
    """Search for pages that mention article title ``q`` without linking to it.

    The title query is combined with a negated ``linksto:`` term. Returns a
    ``(totalhits, results)`` tuple: the hit count reported in ``searchinfo``
    and the ``search`` list from the same response.
    """
    search_terms = f'{article_title_to_search_query(q)} -linksto:"{q}"'
    response = run_search(search_terms, "max")
    return (response["searchinfo"]["totalhits"], response["search"])
def search_count(q: str) -> int:
    """Return how often article title ``q`` appears in search, minus one.

    Only the hit total is needed, so the search is run with ``limit=0``.
    """
    response = run_search(article_title_to_search_query(q), limit=0)
    total_hits = typing.cast(int, response["searchinfo"]["totalhits"])
    # NOTE(review): the -1 presumably discounts the article's own page - confirm.
    return total_hits - 1
def search_count_with_link(q: str) -> int:
    """Return the hit total for title ``q`` restricted by ``linksto:"q"``.

    Uses the same title query as ``search_count`` with a ``linksto:`` term
    appended; only the total is requested (``limit=0``).
    """
    search_terms = f'{article_title_to_search_query(q)} linksto:"{q}"'
    response = run_search(search_terms, limit=0)
    return typing.cast(int, response["searchinfo"]["totalhits"])
def parse_contribs() -> list[tuple[str, int]]:
re_comment = re.compile(r"^link \[\[(.*)\]\] using")
links: collections.Counter[str] = collections.Counter()
for line in open("../wikipedia-contribs/contribs"):
if (
'"comment": "link ' not in line
or "using [[User:Edward/Find link|Find link]]" not in line
):
continue
comment = json.loads(line)["comment"]
m = re_comment.match(comment)
if not m:
continue
link = m.group(1)
if "|" not in link:
links[link] += 1
return links.most_common(200)
def _link_ratio(with_links: int, total: int) -> float:
    """Return ``with_links / total``, or 0.0 when ``total`` is not positive."""
    return with_links / total if total > 0 else 0.0


def _load_seen_titles(filename: str) -> set[str]:
    """Return the ``title`` values already stored in JSON-lines ``filename``."""
    try:
        with open(filename, encoding="utf-8") as f:
            return {json.loads(line)["title"] for line in f if line.strip()}
    except FileNotFoundError:
        # First run: the file is only created once it is opened for appending.
        return set()


def collect_examples(filename: str = "examples") -> None:
    """Append link statistics for each not-yet-seen title to ``filename``.

    Titles come from ``parse_contribs()``. For each new title the total and
    linked hit counts are printed with their ratio, and one JSON object with
    the keys ``title``, ``total`` and ``with_links`` is appended to the file.
    """
    seen = _load_seen_titles(filename)
    with open(filename, "a", encoding="utf-8") as out:
        for from_title, _num in parse_contribs():
            if from_title in seen:
                continue
            count = search_count(from_title)
            count_with_link = search_count_with_link(from_title)
            # search_count() subtracts one from the hit total, so it can be 0.
            ratio = _link_ratio(count_with_link, count)
            print(from_title, count, count_with_link, f"{ratio:.1%}")
            record = {
                "title": from_title,
                "total": count,
                "with_links": count_with_link,
            }
            print(json.dumps(record), file=out)
            # Flush per record so progress survives an interrupted run.
            out.flush()
            time.sleep(0.1)


def report_title(from_title: str) -> None:
    """Print unlinked search hits and link statistics for a single title."""
    count = search_count(from_title)
    count_with_link = search_count_with_link(from_title)
    ratio = _link_ratio(count_with_link, count)
    totalhits, search_hits = search_no_link(from_title)
    for hit in search_hits:
        print(" ", hit)
    print(count, count_with_link, f"{ratio:.1%}", totalhits, len(search_hits))


def main() -> None:
    """Report on the title given as the first argument, else collect examples."""
    if len(sys.argv) > 1:
        report_title(sys.argv[1])
    else:
        collect_examples()


if __name__ == "__main__":
    main()