Initial commit
This commit is contained in:
commit
f07b407e7a
25 changed files with 2383 additions and 0 deletions
114
cmdline.py
Executable file
114
cmdline.py
Executable file
|
|
@ -0,0 +1,114 @@
|
|||
#!/usr/bin/python3
|
||||
|
||||
import collections
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import typing
|
||||
|
||||
from add_links import api
|
||||
|
||||
# from_title = sys.argv[1]
|
||||
|
||||
# Matches a title that ends in a parenthesised qualifier, "Name (qualifier)":
# group 1 captures the leading text, group 2 the text inside the parentheses.
re_disambig = re.compile(r"^(.*) \((.*)\)$")
|
||||
|
||||
|
||||
def article_title_to_search_query(title: str) -> str:
    """Build the quoted search query string for an article title.

    A title of the form ``Name (qualifier)`` becomes
    ``"Name" AND "qualifier"``; any other title is wrapped in double quotes
    as a single phrase.
    """
    disambig = re_disambig.match(title)
    if disambig is None:
        return f'"{title}"'
    name, qualifier = disambig.groups()
    return f'"{name}" AND "{qualifier}"'
|
||||
|
||||
|
||||
def run_search(q: str, limit: int | str = "max") -> dict[str, typing.Any]:
    """Run a text search and return the ``query`` part of the API reply.

    ``q`` is sent as ``srsearch`` and ``limit`` as ``srlimit``; the request
    itself is made by ``api.api_get``.
    """
    params = {
        "list": "search",
        "srwhat": "text",
        "srlimit": limit,
        "srsearch": q,
    }
    reply = api.api_get(params)
    return typing.cast(dict[str, typing.Any], reply["query"])
|
||||
|
||||
|
||||
def search_no_link(q: str) -> tuple[int, list[dict[str, str | int]]]:
    """Find pages that mention the article title without linking to it.

    Returns the total hit count reported by the search together with the
    list of search results from the same response.
    """
    search_terms = f'{article_title_to_search_query(q)} -linksto:"{q}"'
    response = run_search(search_terms, "max")
    return (response["searchinfo"]["totalhits"], response["search"])
|
||||
|
||||
|
||||
def search_count(q: str) -> int:
    """Return how many pages mention this article title.

    Only the hit total is needed, so the search is run with a result limit
    of 0.
    """
    response = run_search(article_title_to_search_query(q), limit=0)
    total = typing.cast(int, response["searchinfo"]["totalhits"])
    # One hit is discounted -- presumably the article's own page; confirm.
    return total - 1
|
||||
|
||||
|
||||
def search_count_with_link(q: str) -> int:
    """Return how many pages mention this article title and link to it.

    Runs the same phrase search as ``search_count`` with a ``linksto:``
    filter added, again asking for the hit total only (limit 0).
    """
    search_terms = f'{article_title_to_search_query(q)} linksto:"{q}"'
    response = run_search(search_terms, limit=0)
    return typing.cast(int, response["searchinfo"]["totalhits"])
|
||||
|
||||
|
||||
def parse_contribs() -> list[tuple[str, int]]:
|
||||
re_comment = re.compile(r"^link \[\[(.*)\]\] using")
|
||||
|
||||
links: collections.Counter[str] = collections.Counter()
|
||||
|
||||
for line in open("../wikipedia-contribs/contribs"):
|
||||
if (
|
||||
'"comment": "link ' not in line
|
||||
or "using [[User:Edward/Find link|Find link]]" not in line
|
||||
):
|
||||
continue
|
||||
comment = json.loads(line)["comment"]
|
||||
|
||||
m = re_comment.match(comment)
|
||||
if not m:
|
||||
continue
|
||||
link = m.group(1)
|
||||
|
||||
if "|" not in link:
|
||||
links[link] += 1
|
||||
|
||||
return links.most_common(200)
|
||||
|
||||
|
||||
def main() -> None:
    """Record link statistics for the most frequently linked titles.

    For each title returned by ``parse_contribs`` that is not already in the
    ``examples`` file, look up how often the title is mentioned and how often
    those mentions link to it, print a summary line, and append a JSON record
    to ``examples``.
    """
    # Titles recorded by earlier runs; a missing file means none so far.
    try:
        with open("examples") as f:
            seen = {json.loads(line)["title"] for line in f}
    except FileNotFoundError:
        seen = set()

    # The context manager closes the output file even if a search raises.
    with open("examples", "a") as out:
        for from_title, _num in parse_contribs():
            if from_title in seen:
                continue
            count = search_count(from_title)
            count_with_link = search_count_with_link(from_title)
            # search_count subtracts one from the hit total, so the count can
            # be zero or negative; do not divide by it in that case.
            ratio = count_with_link / count if count > 0 else 0.0

            print(from_title, count, count_with_link, f"{ratio:.1%}")
            print(
                json.dumps(
                    {"title": from_title, "total": count, "with_links": count_with_link}
                ),
                file=out,
            )
            # Write each record out immediately rather than at exit.
            out.flush()
            time.sleep(0.1)


if __name__ == "__main__":
    main()
    sys.exit(0)
|
||||
Loading…
Add table
Add a link
Reference in a new issue