Initial commit

Edward Betts 2023-10-04 12:56:21 +01:00
commit f07b407e7a
25 changed files with 2383 additions and 0 deletions

4
.gitignore vendored Normal file
@@ -0,0 +1,4 @@
__pycache__
.mypy_cache/
node_modules
package-lock.json

22
add_front_end_libraries.py Executable file
@@ -0,0 +1,22 @@
#!/usr/bin/python3
import os
import shutil
import subprocess
STATIC_DIR = "static"
assert os.path.exists("package.json") and os.path.exists("node_modules")
if not os.path.exists(STATIC_DIR):
os.mkdir(STATIC_DIR)
shutil.copytree(
"node_modules/bootstrap/dist/",
os.path.join(STATIC_DIR, "bootstrap"),
dirs_exist_ok=True,
)
subprocess.run(["npm", "run", "build"], check=True)
shutil.copy("dist/add_links.es.js", "static")

0
add_links/__init__.py Normal file

284
add_links/api.py Normal file
@@ -0,0 +1,284 @@
import re
from typing import Any
import requests
from requests.adapters import HTTPAdapter
from simplejson.scanner import JSONDecodeError
from .language import get_current_language
from .util import is_disambig
ua = (
"find-link/2.2 "
+ "(https://github.com/EdwardBetts/find_link; contact: edward@4angle.com)"
)
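# Matches article titles like "Name (disambiguator)".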
re_disambig = re.compile(r"^(.*) \((.*)\)$")
def get_query_url() -> str:
"""Get the wikipedia query API for the current language."""
return f"https://{get_current_language()}.wikipedia.org/w/api.php"
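# One cached requests session per Wikipedia language.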
sessions = {}
def get_session():
lang = get_current_language()
if lang in sessions:
return sessions[lang]
s = requests.Session()
s.headers = {"User-Agent": ua}
s.mount("https://en.wikipedia.org", HTTPAdapter(max_retries=10))
s.params = {
"format": "json",
"action": "query",
"formatversion": 2,
}
sessions[lang] = s
return s
class MediawikiError(Exception):
pass
class MultipleRedirects(Exception):
pass
class IncompleteReply(Exception):
pass
class MissingPage(Exception):
pass
def check_for_error(json_data):
if "error" in json_data:
raise MediawikiError(json_data["error"]["info"])
webpage_error = (
"Our servers are currently under maintenance or experiencing a technical problem."
)
def api_get(params: dict[str, Any]) -> dict[str, Any]:
"""Make call to Wikipedia API."""
s = get_session()
r = s.get(get_query_url(), params=params)
try:
ret = r.json()
except JSONDecodeError:
if webpage_error in r.text:
raise MediawikiError(webpage_error)
else:
raise MediawikiError("unknown error")
check_for_error(ret)
return ret
def get_first_page(params: dict[str, str]) -> dict[str, Any]:
"""Run Wikipedia API query and return the first page."""
page = api_get(params)["query"]["pages"][0]
if page.get("missing"):
raise MissingPage
return page
def random_article_list(limit=50):
params = {
"list": "random",
"rnnamespace": "0",
"rnlimit": limit,
}
return api_get(params)["query"]["random"]
def wiki_search(q):
m = re_disambig.match(q)
if m:
search = '"{}" AND "{}"'.format(*m.groups())
else:
search = '"{}"'.format(q)
params = {
"list": "search",
"srwhat": "text",
"srlimit": 50,
"srsearch": search,
"continue": "",
}
ret = api_get(params)
query = ret["query"]
totalhits = query["searchinfo"]["totalhits"]
results = query["search"]
for _ in range(10):
if "continue" not in ret:
break
params["sroffset"] = ret["continue"]["sroffset"]
ret = api_get(params)
results += ret["query"]["search"]
return (totalhits, results)
def get_wiki_info(q):
params = {
"prop": "info",
"redirects": "",
"titles": q,
}
ret = api_get(params)["query"]
if "interwiki" in ret:
return None
redirects = []
if ret.get("redirects"):
redirects = ret["redirects"]
if len(redirects) != 1:
# multiple redirects, we should explain to the user that this is
# unsupported
raise MultipleRedirects
if ret["pages"][0].get("missing"):
raise MissingPage(q)
return redirects[0]["to"] if redirects else None
def cat_start(q: str) -> list[str]:
"""Find categories that start with this prefix."""
params = {
"list": "allpages",
"apnamespace": 14, # categories
"apfilterredir": "nonredirects",
"aplimit": 500,
"apprefix": q,
}
ret = api_get(params)["query"]
return [i["title"] for i in ret["allpages"] if i["title"] != q]
def all_pages(q: str) -> list[str]:
"""Get all article titles with a given prefix."""
params = {
"list": "allpages",
"apnamespace": 0,
"apfilterredir": "nonredirects",
"aplimit": 500,
"apprefix": q,
}
ret = api_get(params)["query"]
return [i["title"] for i in ret["allpages"] if i["title"] != q]
def categorymembers(q: str) -> list[str]:
"""List of category members."""
params = {
"list": "categorymembers",
"cmnamespace": 0,
"cmlimit": 500,
"cmtitle": q[0].upper() + q[1:],
}
ret = api_get(params)["query"]
return [i["title"] for i in ret["categorymembers"] if i["title"] != q]
def page_links(titles): # unused
titles = list(titles)
assert titles
params = {
"prop": "links",
"pllimit": 500,
"plnamespace": 0,
"titles": "|".join(titles),
}
ret = api_get(params)["query"]
    return {
        doc["title"]: {link["title"] for link in doc["links"]}
        for doc in ret["pages"]
        if "links" in doc
    }
def find_disambig(titles: list[str]) -> list[str]:
"""Find disambiguation articles in the given list of titles."""
titles = list(titles)
assert titles
pos = 0
disambig: list[str] = []
params = {
"prop": "templates",
"tllimit": 500,
"tlnamespace": 10, # templates
"continue": "",
}
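    # Check templates in batches of 50 titles, the per-request limit of the API.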
while pos < len(titles):
params["titles"] = "|".join(titles[pos : pos + 50])
ret = api_get(params)
disambig.extend(
doc["title"] for doc in ret["query"]["pages"] if is_disambig(doc)
)
for i in range(10):
if "continue" not in ret:
break
tlcontinue = ret["continue"]["tlcontinue"]
params["titles"] = "|".join(titles[pos : pos + 50])
params["tlcontinue"] = tlcontinue
ret = api_get(params)
disambig.extend(
doc["title"] for doc in ret["query"]["pages"] if is_disambig(doc)
)
pos += 50
return disambig
def wiki_redirects(q): # pages that link here
params = {
"list": "backlinks",
"blfilterredir": "redirects",
"bllimit": 500,
"blnamespace": 0,
"bltitle": q,
}
docs = api_get(params)["query"]["backlinks"]
assert all("redirect" in doc for doc in docs)
return (doc["title"] for doc in docs)
def wiki_backlink(q: str) -> tuple[set[str], set[str]]:
"""Get backlinks for article."""
params = {
"list": "backlinks",
"bllimit": 500,
"blnamespace": 0,
"bltitle": q,
"continue": "",
}
ret = api_get(params)
docs = ret["query"]["backlinks"]
while "continue" in ret:
params["blcontinue"] = ret["continue"]["blcontinue"]
ret = api_get(params)
docs += ret["query"]["backlinks"]
articles = {doc["title"] for doc in docs if "redirect" not in doc}
redirects = {doc["title"] for doc in docs if "redirect" in doc}
return (articles, redirects)
def call_get_diff(title, section_num, section_text):
data = {
"prop": "revisions",
"rvprop": "timestamp",
"titles": title,
"rvsection": section_num,
"rvdifftotext": section_text.strip(),
}
s = get_session()
ret = s.post(get_query_url(), data=data).json()
check_for_error(ret)
return ret["query"]["pages"][0]["revisions"][0]["diff"]["body"]

198
add_links/core.py Normal file
@@ -0,0 +1,198 @@
"""Core functions."""
import html
import re
import typing
from pprint import pprint
from .api import (
MediawikiError,
all_pages,
cat_start,
categorymembers,
find_disambig,
get_first_page,
wiki_backlink,
wiki_search,
)
from .util import case_flip_first, norm
re_redirect = re.compile(r"#REDIRECT \[\[(.)([^#]*?)(#.*)?\]\]")
def get_content_and_timestamp(title: str) -> tuple[str, str]:
"""Get article content and timestamp of last update."""
params = {
"prop": "revisions|info",
"rvprop": "content|timestamp",
"titles": title,
}
json_data: dict[str, typing.Any] = get_first_page(params)
if json_data.get("invalid"):
raise MediawikiError(json_data["invalidreason"])
rev = json_data["revisions"][0]
return (rev["content"], rev["timestamp"])
def get_revision_info(title: str) -> dict[str, typing.Any]:
"""Get info about latest revision of article."""
params = {
"prop": "revisions|info",
"rvprop": "content|timestamp|ids",
"titles": title,
}
json_data: dict[str, typing.Any] = get_first_page(params)
if json_data.get("invalid"):
raise MediawikiError(json_data["invalidreason"])
revs = json_data.pop("revisions")
ret = revs[0]
ret["pageid"] = json_data["pageid"]
pprint(json_data)
return typing.cast(dict[str, typing.Any], ret)
def is_redirect_to(title_from: str, title_to: str) -> bool:
title_from = title_from.replace("_", " ")
params = {"prop": "info", "titles": title_from}
if "redirect" not in get_first_page(params):
return False
params = {"prop": "revisions", "rvprop": "content", "titles": title_from}
page_text = get_first_page(params)["revisions"][0]["content"]
m = re_redirect.match(page_text)
assert m
title_to = title_to[0].upper() + title_to[1:]
return m.group(1).upper() + m.group(2) == title_to
def find_longer(
q: str, search: list[dict[str, typing.Any]], articles: set[str]
) -> list[str]:
"""Find other articles with titles that are longer."""
this_title = q[0].upper() + q[1:]
longer: list[str] = all_pages(this_title)
lq = q.lower()
for doc in search:
lt = doc["title"].lower()
if lq == lt or lq not in lt:
continue
articles.add(doc["title"])
more_articles, more_redirects = wiki_backlink(doc["title"])
articles.update(more_articles)
if doc["title"] not in longer:
longer.append(doc["title"])
return longer
def tidy_snippet(snippet: str) -> str:
"""Remove HTML from snippet."""
snippet = snippet.replace("\u2013", "-")
snippet = snippet.replace("</span>", "")
snippet = snippet.replace('<span class="searchmatch">', "")
return html.unescape(snippet)
def match_type(q: str, snippet: str) -> str | None:
"""Discover match type, ''exact', 'case_mismatch' or None.
>>> match_type('foo', 'foo')
'exact'
>>> match_type('foo', 'bar') is None
True
>>> match_type('bar', 'foo bar baz')
'exact'
>>> match_type('clean coal technology', 'foo clean coal technologies baz')
'exact'
>>> match_type('bar', 'foo Bar baz')
'exact'
>>> match_type('bar', 'foo BAR baz')
'case_mismatch'
>>> match_type('foo-bar', 'aa foo-bar cc')
'exact'
>>> match_type(u'foo\u2013bar', 'aa foo-bar cc')
'exact'
"""
q = q.replace("\u2013", "-")
snippet = tidy_snippet(snippet)
if q in snippet or case_flip_first(q) in snippet:
return "exact"
match = None
if q.lower() in snippet.lower():
match = "case_mismatch"
if match != "exact" and q.endswith("y"):
if q[:-1] in snippet or case_flip_first(q[:-1]) in snippet:
return "exact"
elif match is None:
if q[:-1].lower() in snippet.lower():
match = "case_mismatch"
return match
def do_search(
q: str, redirect_to: str | None = None
) -> dict[str, int | list[dict[str, typing.Any]] | list[str] | None]:
this_title = q[0].upper() + q[1:]
totalhits, search_hits = wiki_search(q)
articles, redirects = wiki_backlink(redirect_to or q)
cm = set()
start = cat_start(q)
if len(start) > 5:
start = [] # big categories take too long
for cat in set(["Category:" + this_title] + start):
cm.update(categorymembers(cat))
norm_q = norm(q)
norm_match_redirect = {r for r in redirects if norm(r) == norm_q}
longer_redirect = {r for r in redirects if q.lower() in r.lower()}
articles.add(this_title)
if redirect_to:
articles.add(redirect_to[0].upper() + redirect_to[1:])
longer_redirect = {r for r in redirects if q.lower() in r.lower()}
for r in norm_match_redirect | longer_redirect:
articles.add(r)
a2, r2 = wiki_backlink(r)
articles.update(a2)
redirects.update(r2)
longer = find_longer(q, search_hits, articles) if len(q) > 6 else None
search: list[dict[str, typing.Any]] = [
doc
for doc in search_hits
if doc["title"] not in articles and doc["title"] not in cm
]
if search:
disambig = set(find_disambig([doc["title"] for doc in search]))
search = [doc for doc in search if doc["title"] not in disambig]
# and (doc['title'] not in links or this_title not in links[doc['title']])]
for doc in search:
without_markup = (
doc["snippet"]
.replace("<span class='searchmatch'>", "")
.replace("</span>", "")
.replace(" ", " ")
)
doc["match"] = match_type(q, without_markup)
doc["snippet_without_markup"] = without_markup
return {
"totalhits": totalhits,
"results": search,
"longer": longer,
}
def get_case_from_content(title: str) -> str | None:
"""Check article content to find the case of the article title."""
content, timestamp = get_content_and_timestamp(title)
if title == title.lower() and title in content:
return title
start = content.lower().find("'''" + title.replace("_", " ").lower() + "'''")
if start != -1:
return content[start + 3 : start + 3 + len(title)]
return None # article doesn't contain the title

146
add_links/language.py Normal file
@@ -0,0 +1,146 @@
from flask import session, has_request_context
langs = [
('af', 'Afrikaans', 'Afrikaans'),
('als', 'Alemannisch', 'Alemannic'),
('am', 'አማርኛ', 'Amharic'),
('an', 'aragonés', 'Aragonese'),
('ar', 'العربية', 'Arabic'),
('arz', 'مصرى', 'Egyptian Arabic'),
('ast', 'asturianu', 'Asturian'),
('az', 'azərbaycanca', 'Azerbaijani'),
('azb', 'تۆرکجه', 'Southern Azerbaijani'),
('ba', 'башҡортса', 'Bashkir'),
('bar', 'Boarisch', 'Bavarian'),
('bat-smg', 'žemaitėška', 'Samogitian'),
('be', 'беларуская', 'Belarusian'),
('be-tarask', 'беларуская (тарашкевіца)', 'Belarusian (Taraškievica)'),
('bg', 'български', 'Bulgarian'),
('bn', 'বাংলা', 'Bengali'),
('bpy', 'বিষ্ণুপ্রিয়া মণিপুরী', 'Bishnupriya Manipuri'),
('br', 'brezhoneg', 'Breton'),
('bs', 'bosanski', 'Bosnian'),
('bug', 'ᨅᨔ ᨕᨘᨁᨗ', 'Buginese'),
('ca', 'català', 'Catalan'),
('ce', 'нохчийн', 'Chechen'),
('ceb', 'Cebuano', 'Cebuano'),
('ckb', 'کوردیی ناوەندی', 'Kurdish (Sorani)'),
('cs', 'čeština', 'Czech'),
('cv', 'Чӑвашла', 'Chuvash'),
('cy', 'Cymraeg', 'Welsh'),
('da', 'dansk', 'Danish'),
('de', 'Deutsch', 'German'),
('el', 'Ελληνικά', 'Greek'),
('en', 'English', 'English'),
('eo', 'Esperanto', 'Esperanto'),
('es', 'español', 'Spanish'),
('et', 'eesti', 'Estonian'),
('eu', 'euskara', 'Basque'),
('fa', 'فارسی', 'Persian'),
('fi', 'suomi', 'Finnish'),
('fo', 'føroyskt', 'Faroese'),
('fr', 'français', 'French'),
('fy', 'Frysk', 'West Frisian'),
('ga', 'Gaeilge', 'Irish'),
('gd', 'Gàidhlig', 'Scottish Gaelic'),
('gl', 'galego', 'Galician'),
('gu', 'ગુજરાતી', 'Gujarati'),
('he', 'עברית', 'Hebrew'),
('hi', 'हिन्दी', 'Hindi'),
('hr', 'hrvatski', 'Croatian'),
('hsb', 'hornjoserbsce', 'Upper Sorbian'),
('ht', 'Kreyòl ayisyen', 'Haitian'),
('hu', 'magyar', 'Hungarian'),
('hy', 'Հայերեն', 'Armenian'),
('ia', 'interlingua', 'Interlingua'),
('id', 'Bahasa Indonesia', 'Indonesian'),
('io', 'Ido', 'Ido'),
('is', 'íslenska', 'Icelandic'),
('it', 'italiano', 'Italian'),
('ja', '日本語', 'Japanese'),
('jv', 'Basa Jawa', 'Javanese'),
('ka', 'ქართული', 'Georgian'),
('kk', 'қазақша', 'Kazakh'),
('kn', 'ಕನ್ನಡ', 'Kannada'),
('ko', '한국어', 'Korean'),
('ku', 'Kurdî', 'Kurdish (Kurmanji)'),
('ky', 'Кыргызча', 'Kirghiz'),
('la', 'Latina', 'Latin'),
('lb', 'Lëtzebuergesch', 'Luxembourgish'),
('li', 'Limburgs', 'Limburgish'),
('lmo', 'lumbaart', 'Lombard'),
('lt', 'lietuvių', 'Lithuanian'),
('lv', 'latviešu', 'Latvian'),
('map-bms', 'Basa Banyumasan', 'Banyumasan'),
('mg', 'Malagasy', 'Malagasy'),
('min', 'Baso Minangkabau', 'Minangkabau'),
('mk', 'македонски', 'Macedonian'),
('ml', 'മലയാളം', 'Malayalam'),
('mn', 'монгол', 'Mongolian'),
('mr', 'मराठी', 'Marathi'),
('mrj', 'кырык мары', 'Hill Mari'),
('ms', 'Bahasa Melayu', 'Malay'),
('my', 'မြန်မာဘာသာ', 'Burmese'),
('mzn', 'مازِرونی', 'Mazandarani'),
('nah', 'Nāhuatl', 'Nahuatl'),
('nap', 'Napulitano', 'Neapolitan'),
('nds', 'Plattdüütsch', 'Low Saxon'),
('ne', 'नेपाली', 'Nepali'),
('new', 'नेपाल भाषा', 'Newar'),
('nl', 'Nederlands', 'Dutch'),
('nn', 'norsk nynorsk', 'Norwegian (Nynorsk)'),
('no', 'norsk bokmål', 'Norwegian (Bokmål)'),
('oc', 'occitan', 'Occitan'),
('or', 'ଓଡ଼ିଆ', 'Oriya'),
('os', 'Ирон', 'Ossetian'),
('pa', 'ਪੰਜਾਬੀ', 'Eastern Punjabi'),
('pl', 'polski', 'Polish'),
('pms', 'Piemontèis', 'Piedmontese'),
('pnb', 'پنجابی', 'Western Punjabi'),
('pt', 'português', 'Portuguese'),
('qu', 'Runa Simi', 'Quechua'),
('ro', 'română', 'Romanian'),
('ru', 'русский', 'Russian'),
('sa', 'संस्कृतम्', 'Sanskrit'),
('sah', 'саха тыла', 'Sakha'),
('scn', 'sicilianu', 'Sicilian'),
('sco', 'Scots', 'Scots'),
('sh', 'srpskohrvatski / српскохрватски', 'Serbo-Croatian'),
('si', 'සිංහල', 'Sinhalese'),
('simple', 'Simple English', 'Simple English'),
('sk', 'slovenčina', 'Slovak'),
('sl', 'slovenščina', 'Slovenian'),
('sq', 'shqip', 'Albanian'),
('sr', 'српски / srpski', 'Serbian'),
('su', 'Basa Sunda', 'Sundanese'),
('sv', 'svenska', 'Swedish'),
('sw', 'Kiswahili', 'Swahili'),
('ta', 'தமிழ்', 'Tamil'),
('te', 'తెలుగు', 'Telugu'),
('tg', 'тоҷикӣ', 'Tajik'),
('th', 'ไทย', 'Thai'),
('tl', 'Tagalog', 'Tagalog'),
('tr', 'Türkçe', 'Turkish'),
('tt', 'татарча/tatarça', 'Tatar'),
('uk', 'українська', 'Ukrainian'),
('ur', 'اردو', 'Urdu'),
('uz', 'oʻzbekcha/ўзбекча', 'Uzbek'),
('vec', 'vèneto', 'Venetian'),
('vi', 'Tiếng Việt', 'Vietnamese'),
('vo', 'Volapük', 'Volapük'),
('wa', 'walon', 'Walloon'),
('war', 'Winaray', 'Waray'),
('yi', 'ייִדיש', 'Yiddish'),
('yo', 'Yorùbá', 'Yoruba'),
('zh', '中文', 'Chinese'),
('zh-min-nan', 'Bân-lâm-gú', 'Min Nan'),
('zh-yue', '粵語', 'Cantonese'),
]
def get_langs() -> list[dict[str, str]]:
"""List of all known languages."""
return [dict(zip(('code', 'local', 'english'), l)) for l in langs]
def get_current_language() -> str:
"""Return ISO-3166 language code for the current language."""
return session.get('current_lang', 'en') if has_request_context() else 'en'

381
add_links/match.py Normal file
@@ -0,0 +1,381 @@
from __future__ import unicode_literals
import re
import typing
from .api import MissingPage, call_get_diff, get_wiki_info
from .core import get_case_from_content, get_content_and_timestamp, get_revision_info
from .util import is_title_case, lc_alpha
re_link_in_text = re.compile(r"\[\[[^]]+?\]\]", re.I | re.S)
class LinkReplace(Exception):
pass
en_dash = "\u2013"
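# Map characters in the search phrase to regex fragments that tolerate optional
# commas, hyphens, spaces, line breaks and surrounding [[link]] markup.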
trans = {",": ",?", " ": " *[-\n]? *"}
trans[en_dash] = trans[" "]
trans2 = {" ": r"('?s?\]\])?'?s? ?(\[\[(?:.+\|)?)?", "-": "[- ]"}
trans2[en_dash] = trans2[" "]
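# Regex builders tried in order: each looks for the phrase in wikitext with a
# different tolerance for piped links, existing [[link]] markup and punctuation.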
patterns = [
lambda q: re.compile(
r"(?<!-)(?:\[\[(?:[^]]+\|)?)?(%s)%s(?:\]\])?"
% (
re.escape(q[0]),
"".join("-?" + (trans2[c] if c in trans2 else re.escape(c)) for c in q[1:]),
),
re.I,
),
lambda q: re.compile(
r"(?<!-)\[\[[^|]+\|(%s)%s\]\]" % (re.escape(q[0]), re.escape(q[1:])), re.I
),
lambda q: re.compile(
r"(?<!-)\[\[[^|]+\|(%s)%s(?:\]\])?"
% (
re.escape(q[0]),
"".join("-?" + (trans2[c] if c in trans2 else re.escape(c)) for c in q[1:]),
),
re.I,
),
lambda q: re.compile(r"(?<!-)(%s)%s" % (re.escape(q[0]), re.escape(q[1:])), re.I),
lambda q: re.compile(
r"(?<!-)(%s)%s"
% (
re.escape(q[0]),
"".join((trans[c] if c in trans else re.escape(c)) for c in q[1:]),
),
re.I,
),
]
class NoMatch(Exception):
pass
re_cite = re.compile(
r"<ref( [^>]*?)?>\s*({{cite.*?}}|\[https?://[^]]*?\])\s*</ref>", re.I | re.S
)
def parse_cite(text: str) -> typing.Iterator[tuple[str, str]]:
"""Parse a citation template."""
prev = 0
for m in re_cite.finditer(text):
yield ("text", text[prev : m.start()])
yield ("cite", m.group(0))
prev = m.end()
yield ("text", text[prev:])
re_heading = re.compile(r"^\s*(=+)\s*(.+)\s*\1(<!--.*-->|\s)*$")
def section_iter(text: str) -> typing.Iterator[tuple[str | None, str]]:
"""Iterate sections yielding tuples of heading and section text."""
cur_section = ""
heading = None
in_comment = False
for line in text.splitlines(True):
if "<!--" in line:
in_comment = True
if "-->" in line:
in_comment = False
m = re_heading.match(line)
if in_comment or not m:
cur_section += line
continue
if cur_section or heading:
yield (heading, cur_section)
heading = m.group()
cur_section = ""
continue
yield (heading, cur_section)
def get_subsections(text: str, section_num: int) -> str:
"retrieve the text of subsections for a given section number within an article"
found = ""
collection_level = None
for num, (heading, body) in enumerate(section_iter(text)):
if heading is None:
level = 0
else:
m = re_heading.match(heading)
assert m
level = len(m.group(1))
if num == section_num:
collection_level = level
continue
if collection_level:
if level > collection_level:
assert heading
found += heading + body
else:
break
return found
def match_found(m, q, linkto):
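    """Build the wikitext replacement for a matched phrase, preserving case and an optional link target."""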
if q[1:] == m.group(0)[1:]:
replacement = m.group(1) + q[1:]
elif any(c.isupper() for c in q[1:]) or m.group(0) == m.group(0).upper():
replacement = q
elif is_title_case(m.group(0)):
replacement = None
replacement = get_case_from_content(q)
if replacement is None:
replacement = q.lower()
else:
replacement = m.group(1) + q[1:]
assert replacement
if linkto:
if linkto[0].isupper() and replacement[0] == linkto[0].lower():
linkto = linkto[0].lower() + linkto[1:]
elif replacement[0].isupper():
linkto = linkto[0].upper() + linkto[1:]
replacement = linkto + "|" + replacement
return replacement
def parse_links(text: str) -> typing.Iterator[tuple[str, str]]:
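    """Split wikitext into ("text", ...), ("link", ...) and ("image", ...) tokens."""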
prev = 0
for m in re_link_in_text.finditer(text):
if prev != m.start():
yield ("text", text[prev : m.start()])
if any(
m.group().lower().startswith("[[" + prefix)
for prefix in ("file:", "image:")
):
yield ("image", m.group(0))
else:
yield ("link", m.group(0))
prev = m.end()
if prev < len(text):
yield ("text", text[prev:])
def mk_link_matcher(q):
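    """Return a function that searches text for the phrase q, trying each pattern in turn."""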
re_links = [p(q) for p in patterns]
def search_for_link(text):
for re_link in re_links:
m = re_link.search(text)
if m and m.group(0).count("[[") < 4:
return m
return search_for_link
def add_link(m, replacement, text):
return m.re.sub(lambda m: "[[%s]]" % replacement, text, count=1)
def find_link_in_chunk(q, content, linkto=None):
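    """Find the phrase q in a chunk of wikitext and link its first suitable occurrence."""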
search_for_link = mk_link_matcher(q)
new_content = ""
replacement = None
match_in_non_link = False
bad_link_match = False
found_text_to_link = None
for token_type, text in parse_links(content):
if token_type == "text":
if search_for_link(text):
match_in_non_link = True
elif token_type == "image":
before, sep, link_text = text[:-2].rpartition("|")
m = search_for_link(link_text)
if m:
found_text_to_link = m.group(0)
replacement = match_found(m, q, linkto)
text = before + sep + add_link(m, replacement, link_text) + "]]"
elif token_type == "link" and not replacement and not match_in_non_link:
link_text = text[2:-2]
link_dest = None
if "|" in link_text:
link_dest, link_text = link_text.split("|", 1)
m = search_for_link(link_text)
if m and (not link_dest or not link_dest.startswith("#")):
lc_alpha_q = lc_alpha(q)
bad_link_match = (
link_dest
and len(link_dest) > len(q)
and (lc_alpha_q not in lc_alpha(link_dest))
)
if not link_dest:
if q in link_text and len(link_text) > len(q):
bad_link_match = True
if bad_link_match and link_dest:
try:
link_dest_redirect = get_wiki_info(link_dest)
except MissingPage:
link_dest_redirect = None
if (
link_dest_redirect
and lc_alpha(link_dest_redirect) == lc_alpha_q
):
bad_link_match = False
if not bad_link_match:
replacement = match_found(m, q, linkto)
found_text_to_link = m.group(0)
text = add_link(m, replacement, link_text)
new_content += text
if not replacement:
if bad_link_match:
raise LinkReplace
m = search_for_link(content)
if m:
found_text_to_link = m.group(0)
replacement = match_found(m, q, linkto)
new_content = add_link(m, replacement, content)
if linkto:
m_end = m.end()
re_extend = re.compile(m.re.pattern + r"\w*\b", re.I)
m = re_extend.search(content)
if m and m.end() > m_end:
replacement += content[m_end : m.end()]
new_content = add_link(m, replacement, content)
return (new_content, replacement, found_text_to_link)
def find_link_in_text(q, content):
    (new_content, replacement, _) = find_link_in_chunk(q, content)
if replacement:
return (new_content, replacement)
raise NoMatch
def find_link_in_content(q, content, linkto=None):
if linkto:
try:
return find_link_in_content(linkto, content)
except NoMatch:
pass
replacement = None
new_content = ""
link_replace = False
for header, section_text in section_iter(content):
if header:
new_content += header
for token_type, text in parse_cite(section_text):
if token_type == "text" and not replacement:
try:
(new_text, replacement, replaced_text) = find_link_in_chunk(
q, text, linkto=linkto
)
except LinkReplace:
link_replace = True
if replacement:
text = new_text
new_content += text
if replacement:
return (new_content, replacement, replaced_text)
raise LinkReplace if link_replace else NoMatch
def find_link_and_section(q, content, linkto=None):
if linkto:
try:
return find_link_and_section(linkto, content)
except NoMatch:
pass
sections = list(section_iter(content))
replacement = None
search_for_link = mk_link_matcher(q)
found: dict[str, str | int] = {}
for section_num, (header, section_text) in enumerate(sections):
new_content = ""
if header:
new_content += header
for token_type, text in parse_cite(section_text):
if token_type == "text" and not replacement:
new_text = ""
for token_type2, text2 in parse_links(text):
if token_type2 == "link" and not replacement:
link_text = text2[2:-2]
if "|" in link_text:
link_dest, link_text = link_text.split("|", 1)
else:
link_dest = None
m = search_for_link(link_text)
if m:
if link_dest:
found["link_dest"] = link_dest
found["link_text"] = link_text
replacement = match_found(m, q, None)
text2 = add_link(m, replacement, link_text)
new_text += text2
if replacement:
text = new_text
else:
m = search_for_link(text)
if m:
replacement = match_found(m, q, linkto)
text = add_link(m, replacement, text)
new_content += text
if replacement:
found.update(
{
"section_num": section_num,
"section_text": new_content,
"old_text": (header or "") + section_text,
"replacement": replacement,
}
)
return found
raise NoMatch
def find_refs(text: str) -> list[str]:
"""Find <ref> in wikitext."""
refs = re.findall("<ref(?:[^>]*)>(.+?)</ref>", text)
print(refs)
return refs
def new_link_is_in_ref(replacement: str, text: str) -> bool:
"""Is the new link in a <ref>."""
link = f"[[{replacement}]]"
return any(link in ref for ref in find_refs(text))
def get_match(q: str, title: str, linkto: str | None) -> dict[str, typing.Any]:
"""Get match."""
rev = get_revision_info(title)
found: dict[str, typing.Any] = find_link_and_section(q, rev["content"], linkto)
assert not new_link_is_in_ref(found["replacement"], found["section_text"])
found["revid"] = rev["revid"]
found["pageid"] = rev["pageid"]
found["section_text"] += get_subsections(rev["content"], found["section_num"])
return found
def get_diff(q: str, title: str, linkto: str | None) -> dict[str, typing.Any]:
"""Get diff."""
content, timestamp = get_content_and_timestamp(title)
found: dict[str, typing.Any] = find_link_and_section(q, content, linkto)
if new_link_is_in_ref(found["replacement"], found["section_text"]):
raise NoMatch
section_text = found["section_text"] + get_subsections(
content, found["section_num"]
)
found["diff"] = call_get_diff(title, found["section_num"], section_text)
return found

101
add_links/mediawiki_api.py Normal file
@@ -0,0 +1,101 @@
"""Interface with the mediawiki API."""
import typing
from pprint import pprint
from typing import Any, cast
from . import wikidata_oauth
wiki_hostname = "en.wikipedia.org"
wiki_api_php = f"https://{wiki_hostname}/w/api.php"
user_agent = "add-links/0.1"
def parse_page(enwiki: str) -> dict[str, Any]:
"""Call mediawiki parse API for given article."""
params: dict[str, str | int] = {
"action": "parse",
"format": "json",
"formatversion": 2,
"disableeditsection": 1,
"page": enwiki,
"prop": "text|links|headhtml",
"disabletoc": 1,
}
parse: dict[str, Any] = call(params)["parse"]
return parse
def call(params: dict[str, str | int]) -> dict[str, typing.Any]:
"""Make GET request to mediawiki API."""
data = wikidata_oauth.api_post_request(params)
return cast(dict[str, Any], data.json())
def article_exists(title: str) -> bool:
"""Get article text."""
params: dict[str, str | int] = {
"action": "query",
"format": "json",
"formatversion": 2,
"titles": title,
}
return not call(params)["query"]["pages"][0].get("missing")
def get_content(title: str) -> tuple[str, int]:
"""Get article text."""
params: dict[str, str | int] = {
"action": "query",
"format": "json",
"formatversion": 2,
"prop": "revisions|info",
"rvprop": "content|timestamp|ids",
"titles": title,
}
data = call(params)
rev = data["query"]["pages"][0]["revisions"][0]
content: str = rev["content"]
revid: int = int(rev["revid"])
return content, revid
def compare(title: str, new_text: str) -> str:
"""Generate a diff for the new article text."""
params: dict[str, str | int] = {
"format": "json",
"formatversion": 2,
"action": "compare",
"fromtitle": title,
"toslots": "main",
"totext-main": new_text,
"prop": "diff",
}
diff: str = call(params)["compare"]["body"]
return diff
def edit_page(
pageid: int, section: str | int, text: str, summary: str, baserevid: str, token: str
) -> str:
"""Edit a page on Wikipedia."""
params: dict[str, str | int] = {
"format": "json",
"formatversion": 2,
"action": "edit",
"pageid": pageid,
"text": text,
"baserevid": baserevid,
"token": token,
"nocreate": 1,
"summary": summary,
"section": section,
}
ret = call(params)
if "edit" not in ret:
print("params")
pprint(params)
print()
pprint(ret)
return typing.cast(str, ret["edit"])

@@ -0,0 +1,48 @@
"""Interface with the mediawiki API."""
from typing import Any
import requests
wiki_hostname = "en.wikipedia.org"
wiki_api_php = f"https://{wiki_hostname}/w/api.php"
user_agent = "dab-mechanic/0.1"
def parse_page(enwiki: str) -> dict[str, Any]:
"""Call mediawiki parse API for given article."""
params: dict[str, str | int] = {
"action": "parse",
"format": "json",
"formatversion": 2,
"disableeditsection": 1,
"page": enwiki,
"prop": "text|links|headhtml",
"disabletoc": 1,
}
parse: dict[str, Any] = get(params)["parse"]
return parse
def get(params: dict[str, str | int]) -> dict[str, Any]:
"""Make GET request to mediawiki API."""
data: dict[str, Any] = requests.get(
wiki_api_php, headers={"User-Agent": user_agent}, params=params
).json()
return data
def get_content(title: str) -> str:
"""Get article text."""
params: dict[str, str | int] = {
"action": "query",
"format": "json",
"formatversion": 2,
"prop": "revisions|info",
"rvprop": "content|timestamp",
"titles": title,
}
data = get(params)
rev: str = data["query"]["pages"][0]["revisions"][0]["content"]
return rev

115
add_links/util.py Normal file
@@ -0,0 +1,115 @@
"""Util functions."""
import re
import urllib
from typing import Any
# util functions that don't access the network
namespaces = {
ns.casefold()
for ns in (
"Special",
"Media",
"Talk",
"Template",
"Portal",
"Portal talk",
"Book",
"Book talk",
"Template talk",
"Draft",
"Draft talk",
"Help",
"Help talk",
"Category",
"Category talk",
"User",
"Gadget",
"Gadget talk",
"Gadget definition",
"Gadget definition talk",
"Topic",
"User talk",
"Wikipedia",
"Education Program",
"Education Program talk",
"Wikipedia talk",
"File",
"File talk",
"TimedText",
"TimedText talk",
"MediaWiki",
"Module",
"Module talk",
"MediaWiki talk",
)
}
re_space_or_dash = re.compile("[ -]")
def is_title_case(phrase: str) -> bool:
"""Is a given phrase is in Title Case."""
return all(
term[0].isupper() and term[1:].islower()
for term in re_space_or_dash.split(phrase)
if term and term[0].isalpha()
)
def urlquote(value: str) -> str:
"""Prepare string for use in URL param."""
return urllib.parse.quote_plus(value.encode("utf-8"))
def strip_parens(q: str) -> str:
"""Remove a word in parenthesis from the end of a string."""
m = re.search(r" \(.*?\)$", q)
return q[: m.start()] if m else q
def starts_with_namespace(title: str) -> bool:
"""Check if a title starts with a namespace."""
return ":" in title and title.split(":", 1)[0].casefold() in namespaces
def is_disambig(doc: dict[str, Any]) -> bool:
"""Is a this a disambiguation page."""
return any(
"disambig" in t
or t.endswith("dis")
or "given name" in t
or t == "template:surname"
for t in (t["title"].lower() for t in doc.get("templates", []))
)
def norm(s: str) -> str:
"""Normalise string."""
s = re.sub(r"\W", "", s).lower()
return s[:-1] if s and s[-1] == "s" else s
def case_flip(s: str) -> str:
"""Switch case of character."""
if s.islower():
return s.upper()
if s.isupper():
return s.lower()
return s
def case_flip_first(s: str) -> str:
"""Switch case of first character in string."""
return case_flip(s[0]) + s[1:]
def lc_alpha(s: str) -> str:
"""Lower case alphabetic characters in string."""
return "".join(c.lower() for c in s if c.isalpha())
def wiki_space_norm(s: str) -> str:
"""Normalise article title."""
return s.replace("_", " ").strip()

98
add_links/wikidata_oauth.py Normal file
@@ -0,0 +1,98 @@
import typing
import urllib
from typing import cast
from flask import current_app, session
from requests_oauthlib import OAuth1Session
wiki_hostname = "en.wikipedia.org"
api_url = f"https://{wiki_hostname}/w/api.php"
def get_edit_proxy() -> dict[str, str]:
"""Retrieve proxy information from config."""
edit_proxy = current_app.config.get("EDIT_PROXY")
if edit_proxy:
return {"http": edit_proxy, "https": edit_proxy}
else:
return {}
def api_post_request(params: dict[str, str | int]):
"""HTTP Post using Oauth."""
app = current_app
# url = "https://www.wikidata.org/w/api.php"
client_key = app.config["CLIENT_KEY"]
client_secret = app.config["CLIENT_SECRET"]
oauth = OAuth1Session(
client_key,
client_secret=client_secret,
resource_owner_key=session["owner_key"],
resource_owner_secret=session["owner_secret"],
)
proxies = get_edit_proxy()
return oauth.post(api_url, data=params, timeout=4, proxies=proxies)
def raw_request(params: typing.Mapping[str, str | int]):
"""Low-level API request."""
app = current_app
# url = "https://www.wikidata.org/w/api.php?" + urlencode(params)
client_key = app.config["CLIENT_KEY"]
client_secret = app.config["CLIENT_SECRET"]
oauth = OAuth1Session(
client_key,
client_secret=client_secret,
resource_owner_key=session["owner_key"],
resource_owner_secret=session["owner_secret"],
)
proxies = get_edit_proxy()
return oauth.get(
api_url + "?" + urllib.parse.urlencode(params), timeout=4, proxies=proxies
)
def api_request(params: typing.Mapping[str, str | int]) -> dict[str, typing.Any]:
"""Make an API request with OAuth."""
r = raw_request(params)
try:
return cast(dict[str, typing.Any], r.json())
except Exception:
print("text")
print(r.text)
print("---")
raise
def get_token() -> str:
"""Get CSRF tokebn from MediaWiki API."""
params: dict[str, str | int] = {
"action": "query",
"meta": "tokens",
"format": "json",
"formatversion": 2,
}
reply = api_request(params)
token: str = reply["query"]["tokens"]["csrftoken"]
return token
def userinfo_call() -> typing.Mapping[str, typing.Any]:
"""Request user information via OAuth."""
params = {"action": "query", "meta": "userinfo", "format": "json"}
return api_request(params)
def get_username() -> None | str:
"""Get the username or None if not logged in."""
if "owner_key" not in session:
return None # not authorized
if "username" not in session:
reply = userinfo_call()
if "query" not in reply:
return None
session["username"] = reply["query"]["userinfo"]["name"]
return cast(str, session["username"])

201
add_links/wikipedia.py Normal file
@@ -0,0 +1,201 @@
from collections import defaultdict
from typing import Any, Iterator, Optional, TypedDict
import flask
import lxml.html
from . import mediawiki_api
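# Templates that identify a linked page as a disambiguation page.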
disambig_templates = [
"Template:Disambiguation",
"Template:Airport disambiguation",
"Template:Biology disambiguation",
"Template:Call sign disambiguation",
"Template:Caselaw disambiguation",
"Template:Chinese title disambiguation",
"Template:Disambiguation cleanup",
"Template:Genus disambiguation",
"Template:Hospital disambiguation",
"Template:Human name disambiguation",
"Template:Human name disambiguation cleanup",
"Template:Letter-number combination disambiguation",
"Template:Mathematical disambiguation",
"Template:Military unit disambiguation",
"Template:Music disambiguation",
"Template:Number disambiguation",
"Template:Opus number disambiguation",
"Template:Phonetics disambiguation",
"Template:Place name disambiguation",
"Template:Portal disambiguation",
"Template:Road disambiguation",
"Template:School disambiguation",
"Template:Species Latin name abbreviation disambiguation",
"Template:Species Latin name disambiguation",
"Template:Station disambiguation",
"Template:Synagogue disambiguation",
"Template:Taxonomic authority disambiguation",
"Template:Taxonomy disambiguation",
"Template:Template disambiguation",
"Template:WoO number disambiguation",
]
def link_params(enwiki: str) -> dict[str, str | int]:
"""Parameters for finding article links from the API."""
params: dict[str, str | int] = {
"action": "query",
"format": "json",
"formatversion": 2,
"titles": enwiki,
"generator": "links",
"gpllimit": "max",
"gplnamespace": 0,
"tllimit": "max",
"redirects": 1,
"tlnamespace": 10,
"tltemplates": "|".join(disambig_templates),
"prop": "templates",
}
return params
def needs_disambig(link: dict[str, Any]) -> bool:
"""Is this a disambiguation link."""
return bool(
not link["title"].endswith(" (disambiguation)") and link.get("templates")
)
def get_article_links(enwiki: str) -> list[str]:
"""Get links that appear in this article."""
params: dict[str, str | int] = link_params(enwiki)
links: set[str] = set()
redirects = defaultdict(set)
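    # Follow API continuation until every batch of links has been fetched.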
while True:
data = mediawiki_api.get(params)
pages = data["query"].pop("pages")
for r in data["query"].pop("redirects"):
redirects[r["to"]].add(r["from"])
links.update(page["title"] for page in pages if needs_disambig(page))
if "continue" not in data:
break
params["gplcontinue"] = data["continue"]["gplcontinue"]
for link in set(links):
if link in redirects:
links.update(redirects[link])
return list(links)
# return {link["title"] for link in r.json()["query"]["pages"][0]["links"]}
def get_article_html(enwiki: str) -> str:
"""Parse article wikitext and return HTML."""
text: str = mediawiki_api.parse_page(enwiki)["text"]
return text
class DabItem(TypedDict):
"""Represent a disabiguation page."""
num: int
title: str
html: str
def delete_toc(root: lxml.html.HtmlElement) -> None:
"""Delete table of contents from article HTML."""
for toc in root.findall(".//div[@class='toc']"):
toc.getparent().remove(toc)
def get_dab_html(dab_num: int, title: str) -> str:
"""Parse dab page and rewrite links."""
dab_html = get_article_html(title)
root = lxml.html.fromstring(dab_html)
delete_toc(root)
element_id_map = {e.get("id"): e for e in root.findall(".//*[@id]")}
for a in root.findall(".//a[@href]"):
href: str | None = a.get("href")
if not href:
continue
if not href.startswith("#"):
a.set("href", "#")
a.set("onclick", f"return select_dab(this, {dab_num})")
continue
destination_element = element_id_map[href[1:]]
assert destination_element is not None
destination_element.set("id", f"{dab_num}{href[1:]}")
a.set("href", f"#{dab_num}{href[1:]}")
html: str = lxml.html.tostring(root, encoding=str)
return html
class Article:
"""Current article we're working on."""
def __init__(self, enwiki: str) -> None:
"""Make a new Article object."""
self.enwiki = enwiki.replace("_", " ")
self.links = get_article_links(enwiki)
self.dab_list: list[DabItem] = []
self.dab_lookup: dict[int, str] = {}
self.dab_order: list[str] = []
self.parse: Optional[dict[str, Any]] = None
def save_endpoint(self) -> str:
"""Endpoint for saving changes."""
href: str = flask.url_for("save", enwiki=self.enwiki.replace(" ", "_"))
return href
def load(self) -> None:
"""Load parsed article HTML."""
self.parse = mediawiki_api.parse_page(self.enwiki)
self.root = lxml.html.fromstring(self.parse.pop("text"))
def iter_links(self) -> Iterator[tuple[lxml.html.Element, str]]:
"""Disambiguation links that need fixing."""
seen = set()
for a in self.root.findall(".//a[@href]"):
title = a.get("title")
if title is None or title not in self.links:
continue
a.set("class", "disambig")
if title in seen:
continue
seen.add(title)
yield a, title
def process_links(self) -> None:
"""Process links in parsed wikitext."""
for dab_num, (a, title) in enumerate(self.iter_links()):
a.set("id", f"dab-{dab_num}")
dab: DabItem = {
"num": dab_num,
"title": title,
"html": get_dab_html(dab_num, title),
}
self.dab_list.append(dab)
self.dab_order.append(title)
self.dab_lookup[dab_num] = title
def get_html(self) -> str:
"""Return the processed article HTML."""
html: str = lxml.html.tostring(self.root, encoding=str)
return html

114
cmdline.py Executable file
@@ -0,0 +1,114 @@
#!/usr/bin/python3
import collections
import json
import re
import sys
import time
import typing
from add_links import api
# from_title = sys.argv[1]
re_disambig = re.compile(r"^(.*) \((.*)\)$")
def article_title_to_search_query(title: str) -> str:
"""Convert from article title to search query string."""
m = re_disambig.match(title)
return f'"{m.group(1)}" AND "{m.group(2)}"' if m else f'"{title}"'
def run_search(q: str, limit: int | str = "max") -> dict[str, typing.Any]:
"""Search Wikipedia."""
params = {"list": "search", "srwhat": "text", "srlimit": limit, "srsearch": q}
return typing.cast(dict[str, typing.Any], api.api_get(params)["query"])
def search_no_link(q: str) -> tuple[int, list[dict[str, str | int]]]:
"""Search for mentions of article title with no link included."""
query = run_search(article_title_to_search_query(q) + f' -linksto:"{q}"', "max")
totalhits = query["searchinfo"]["totalhits"]
results = query["search"]
return (totalhits, results)
def search_count(q: str) -> int:
"""How often does this article title appear in Wikipedia."""
query = run_search(article_title_to_search_query(q), limit=0)
return typing.cast(int, query["searchinfo"]["totalhits"]) - 1
def search_count_with_link(q: str) -> int:
"""How often does this article title appear in Wikipedia."""
query = run_search(article_title_to_search_query(q) + f' linksto:"{q}"', limit=0)
return typing.cast(int, query["searchinfo"]["totalhits"])
def parse_contribs() -> list[tuple[str, int]]:
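    """Count the articles most often linked in past Find link edits, from a local contribs dump."""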
re_comment = re.compile(r"^link \[\[(.*)\]\] using")
links: collections.Counter[str] = collections.Counter()
for line in open("../wikipedia-contribs/contribs"):
if (
'"comment": "link ' not in line
or "using [[User:Edward/Find link|Find link]]" not in line
):
continue
comment = json.loads(line)["comment"]
m = re_comment.match(comment)
if not m:
continue
link = m.group(1)
if "|" not in link:
links[link] += 1
return links.most_common(200)
with open("examples") as f:
seen = {json.loads(line)["title"] for line in f}
out = open("examples", "a")
for from_title, num in parse_contribs():
if from_title in seen:
continue
count = search_count(from_title)
count_with_link = search_count_with_link(from_title)
ratio = float(count_with_link) / float(count)
print(from_title, count, count_with_link, f"{ratio:.1%}")
print(
json.dumps(
{"title": from_title, "total": count, "with_links": count_with_link}
),
file=out,
)
out.flush()
time.sleep(0.1)
out.close()
sys.exit(0)
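# Note: the code below is unreachable because of the sys.exit(0) above.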
count = search_count(from_title)
count_with_link = search_count_with_link(from_title)
ratio = float(count_with_link) / float(count)
print(count, count_with_link, f"{ratio:.1%}")
sys.exit(0)
totalhits, search_hits = search_no_link(from_title)
for hit in search_hits:
print(" ", hit)
print(count, count_with_link, f"{ratio:.1%}", totalhits, len(search_hits))
# ret = core.do_search(from_title)
# print(ret)

23
frontend/.eslintrc.js Normal file
@@ -0,0 +1,23 @@
module.exports = {
"env": {
"browser": true,
"es6": true
},
"extends": [
"plugin:vue/essential",
"standard"
],
"globals": {
"Atomics": "readonly",
"SharedArrayBuffer": "readonly"
},
"parserOptions": {
"ecmaVersion": 14,
"sourceType": "module"
},
"plugins": [
"vue"
],
"rules": {
}
};

55
frontend/App.vue Normal file
@@ -0,0 +1,55 @@
<template>
Hello world: {{ title }}
<div v-for="hit in this.hits" class="mt-3">
<div><strong>{{ hit.title }}</strong> ({{ hit.wordcount }} words)</div>
<div v-html="hit.snippet"></div>
<table v-html="hit.diff"></table>
<div>replacement: {{ hit.replacement }}</div>
</div>
</template>
<script>
import axios from "redaxios";
export default {
props: {
title: String,
api_base_url: String,
},
data() {
return {
hits: [],
};
},
computed: {
},
watch: {
},
methods: {
api_call(endpoint, options) {
var url = `${this.api_base_url}/${endpoint}`;
return axios.get(url, options).catch(this.show_api_error_modal);
},
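    // Check a search hit with the API and keep it only if a valid diff comes back.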
add_hit(hit) {
var params = { link_from: hit.title, link_to: this.title };
this.api_call("valid_hit", { params: params}).then((response) => {
if (response.data.valid) {
hit.diff = response.data.diff
hit.replacement = response.data.replacement
this.hits.push(hit);
}
});
}
},
mounted() {
var params = { title: this.title }
this.api_call("hits", { params: params}).then((response) => {
response.data.hits.forEach((hit) => { this.add_hit(hit) });
});
}
};
</script>
<style>
</style>

7
frontend/entry.js Normal file
@@ -0,0 +1,7 @@
import {createApp} from 'vue';
import App from './App.vue';
export default function(props) {
const app = createApp(App, props).mount('#app');
return app;
}

18
package.json Normal file
@@ -0,0 +1,18 @@
{
"name": "add-links",
"version": "0.0.1",
"scripts": {
"dev": "vite",
"build": "vite build"
},
"dependencies": {
"bootstrap": "^5.2.3",
"vue": "^3.3.4"
},
"devDependencies": {
"@vitejs/plugin-vue": "^4.2.3",
"eslint": "^8.41.0",
"eslint-plugin-vue": "^9.13.0",
"vite": "^4.3.8"
}
}

10
templates/all_done.html Normal file
@@ -0,0 +1,10 @@
{% extends "base.html" %}
{% block title %}Index{% endblock %}
{% block content %}
<div class="container">
<h1>All done</h1>
<div><a href="{{ url_for('index') }}">back to index </a></div>
</div>
{% endblock %}

56
templates/article.html Normal file
@@ -0,0 +1,56 @@
{% extends "base.html" %}
{% block title %}{{ title }}{% endblock %}
{% block style %}
<style>
span.exact { padding: 2px; background: green; color: white; font-weight: bold; }
span.nomatch { padding: 2px; background: red; color: white; font-weight: bold; }
span.case_mismatch { padding: 2px; background: orange; color: white; font-weight: bold; }
span.searchmatch { font-weight: bold; }
table.diff,td.diff-otitle,td.diff-ntitle{background-color:white}
td.diff-otitle,td.diff-ntitle{text-align:center}
td.diff-marker{text-align:right;font-weight:bold;font-size:1.25em}
td.diff-lineno{font-weight:bold}
td.diff-addedline,td.diff-deletedline,td.diff-context{font-size:88%;vertical-align:top;white-space:-moz-pre-wrap;white-space:pre-wrap}
td.diff-addedline,td.diff-deletedline{border-style:solid;border-width:1px 1px 1px 4px;border-radius:0.33em}
td.diff-addedline{border-color:#a3d3ff}
td.diff-deletedline{border-color:#ffe49c}
td.diff-context{background:#f3f3f3;color:#333333;border-style:solid;border-width:1px 1px 1px 4px;border-color:#e6e6e6;border-radius:0.33em}
.diffchange{font-weight:bold;text-decoration:none}
table.diff{border:none;width:98%;border-spacing:4px; table-layout:fixed}
td.diff-addedline .diffchange,td.diff-deletedline .diffchange{border-radius:0.33em;padding:0.25em 0}
td.diff-addedline .diffchange{background:#d8ecff}
td.diff-deletedline .diffchange{background:#feeec8}
table.diff td{padding:0.33em 0.66em}
table.diff col.diff-marker{width:2%}
table.diff col.diff-content{width:48%}
table.diff td div{ word-wrap:break-word; overflow:auto}
</style>
{% endblock %}
{% block content %}
<div class="container">
<h1>{{ self.title() }}</h1>
<form>
<input name="q">
<input type="submit" value="search">
</form>
<div id="app"></div>
</div>
<script type="module">
import main from {{ url_for('static', filename='add_links.es.js') | tojson }};
const props = {
title: {{ title | tojson }},
api_base_url: "/api/1"
}
main(props);
</script>
{% endblock %}

66
templates/article2.html Normal file
@@ -0,0 +1,66 @@
{% extends "base.html" %}
{% block title %}{{ title }}{% endblock %}
{% block style %}
<style>
span.exact { padding: 2px; background: green; color: white; font-weight: bold; }
span.nomatch { padding: 2px; background: red; color: white; font-weight: bold; }
span.case_mismatch { padding: 2px; background: orange; color: white; font-weight: bold; }
span.searchmatch { font-weight: bold; }
table.diff,td.diff-otitle,td.diff-ntitle{background-color:white}
td.diff-otitle,td.diff-ntitle{text-align:center}
td.diff-marker{text-align:right;font-weight:bold;font-size:1.25em}
td.diff-lineno{font-weight:bold}
td.diff-addedline,td.diff-deletedline,td.diff-context{font-size:88%;vertical-align:top;white-space:-moz-pre-wrap;white-space:pre-wrap}
td.diff-addedline,td.diff-deletedline{border-style:solid;border-width:1px 1px 1px 4px;border-radius:0.33em}
td.diff-addedline{border-color:#a3d3ff}
td.diff-deletedline{border-color:#ffe49c}
td.diff-context{background:#f3f3f3;color:#333333;border-style:solid;border-width:1px 1px 1px 4px;border-color:#e6e6e6;border-radius:0.33em}
.diffchange{font-weight:bold;text-decoration:none}
table.diff{border:none;width:98%;border-spacing:4px; table-layout:fixed}
td.diff-addedline .diffchange,td.diff-deletedline .diffchange{border-radius:0.33em;padding:0.25em 0}
td.diff-addedline .diffchange{background:#d8ecff}
td.diff-deletedline .diffchange{background:#feeec8}
table.diff td{padding:0.33em 0.66em}
table.diff col.diff-marker{width:2%}
table.diff col.diff-content{width:48%}
table.diff td div{ word-wrap:break-word; overflow:auto}
</style>
{% endblock %}
{% block content %}
<div class="container">
<h1>{{ self.title() }}</h1>
<form action="{{ url_for("index") }}">
<input name="q">
<input type="submit" value="search">
</form>
<div>Username: {{ g.user }}</div>
<div><a href="https://en.wikipedia.org/wiki/{{ title }}" target="_blank">view article</a></div>
<div><a href="{{ url_for('index') }}">back to index </a></div>
<div>total: {{ total }}</div>
<div>with link: {{ with_link }}</div>
<div>ratio: {{ "{:.1%}".format(with_link / total) }}</div>
<div>hit: {{ hit }}</div>
<div>replacement: {{ found.replacement }}</div>
<div>section: {{ found.section }}</div>
<table>
{{ diff | safe }}
</table>
<form method="POST">
<input type="hidden" name="hit" value="{{ hit.title }}">
<div class="my-3">
<input type="submit" class="btn btn-primary" value="save"/>
<a href="{{url_for("article_page", url_title=url_title, after=hit["title"])}}" class="btn btn-primary">skip</a>
</div>
</form>
</div>
{% endblock %}

22
templates/base.html Normal file
@@ -0,0 +1,22 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<link href="{{ url_for("static", filename="bootstrap/css/bootstrap.min.css") }}" rel="stylesheet">
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>
{% block title %}{% endblock %}
</title>
{% block style %}{% endblock %}
</head>
<body>
{% block content %}{% endblock %}
<script src="{{ url_for("static", filename="bootstrap/js/bootstrap.bundle.min.js")}}></script>
{% block script %}{% endblock %}
</body>
</html>

25
templates/index.html Normal file
@@ -0,0 +1,25 @@
{% extends "base.html" %}
{% block title %}Index{% endblock %}
{% block content %}
<div class="container">
<h1>Index</h1>
<form>
<input name="q">
<input type="submit" value="search">
</form>
<div>Username: {{ g.user }}</div>
<table class="table w-auto">
{% for item in examples %}
<tr>
<td><a href="{{ article_url(item.title) }}">{{ item.title }}</a></td>
<td>{{ item.total }}</td>
<td>{{ "{:.1%}".format(item.with_links / item.total) }}</td>
</tr>
{% endfor %}
</table>
</div>
{% endblock %}

10
templates/save_done.html Normal file
@@ -0,0 +1,10 @@
{% extends "base.html" %}
{% block title %}Index{% endblock %}
{% block content %}
<div class="container">
<h1>Save done</h1>
<div>Save is complete.</div>
</div>
{% endblock %}

17
vite.config.js Normal file
@@ -0,0 +1,17 @@
import { defineConfig } from 'vite'
import vue from '@vitejs/plugin-vue'
import path from 'path'
export default defineConfig({
plugins: [vue()],
define: {
'process.env.NODE_ENV': JSON.stringify('production'),
},
build: {
lib: {
entry: path.resolve(__dirname, 'frontend/entry.js'),
name: 'AddLinks',
fileName: (format) => `add_links.${format}.js`,
},
},
})

362
web_view.py Executable file
@@ -0,0 +1,362 @@
#!/usr/bin/python3
import html
import itertools
import json
import re
import typing
import flask
import werkzeug
from requests_oauthlib import OAuth1Session
from werkzeug.wrappers.response import Response
from add_links import api, core, mediawiki_api, wikidata_oauth
from add_links.match import NoMatch, get_diff, get_match
app = flask.Flask(__name__)
app.config.from_object("config.default")
app.debug = True
wiki_hostname = "en.wikipedia.org"
wiki_api_php = f"https://{wiki_hostname}/w/api.php"
wiki_index_php = f"https://{wiki_hostname}/w/index.php"
class Hit(typing.TypedDict):
"""Candidate articles."""
ns: int
title: str
pageid: int
size: int
wordcount: int
snippet: str
timestamp: str
re_disambig = re.compile(r"^(.*) \((.*)\)$")
def load_examples() -> list[dict[str, str | int]]:
"""Load examples."""
return [json.loads(line) for line in open("examples")]
def article_title_to_search_query(title: str) -> str:
"""Convert from article title to search query string."""
m = re_disambig.match(title)
return f'"{m.group(1)}" AND "{m.group(2)}"' if m else f'"{title}"'
def run_search(q: str, limit: int | str = "max") -> dict[str, typing.Any]:
"""Search Wikipedia."""
params = {"list": "search", "srwhat": "text", "srlimit": limit, "srsearch": q}
return typing.cast(dict[str, typing.Any], api.api_get(params)["query"])
def article_url(title: str) -> str:
"""URL for search page."""
return flask.url_for("article_page", url_title=title.replace(" ", "_"))
def search_count(q: str) -> int:
"""How often does this article title appear in Wikipedia."""
query = run_search(article_title_to_search_query(q), limit=0)
return typing.cast(int, query["searchinfo"]["totalhits"]) - 1
def search_count_with_link(q: str) -> int:
"""How often does this article title appear in Wikipedia."""
query = run_search(article_title_to_search_query(q) + f' linksto:"{q}"', limit=0)
return typing.cast(int, query["searchinfo"]["totalhits"])
def search_no_link(q: str) -> tuple[int, list[Hit]]:
"""Search for mentions of article title with no link included."""
query = run_search(article_title_to_search_query(q) + f' -linksto:"{q}"', "max")
totalhits = query["searchinfo"]["totalhits"]
results = query["search"]
return (totalhits, results)
@app.before_request
def global_user() -> None:
"""Make username available everywhere."""
flask.g.user = wikidata_oauth.get_username()
@app.route("/")
def index() -> str | Response:
"""Index page."""
if "oauth_verifier" in flask.request.args and "oauth_token" in flask.request.args:
return flask.redirect(flask.url_for("oauth_callback", **flask.request.args))
examples = load_examples()
examples.sort(
key=lambda i: float(i["with_links"]) / float(i["total"]), reverse=True
)
if q := flask.request.args.get("q"):
if q_trimmed := q.strip():
return flask.redirect(article_url(q_trimmed))
return flask.render_template(
"index.html", examples=examples, article_url=article_url
)
def case_flip(s: str) -> str:
"""Switch case of character."""
if s.islower():
return s.upper()
if s.isupper():
return s.lower()
return s
def case_flip_first(s: str) -> str:
"""Switch case of first character in string."""
return case_flip(s[0]) + s[1:]
def tidy_snippet(snippet: str) -> str:
"""Remove HTML from snippet."""
snippet = snippet.replace("\u2013", "-")
snippet = snippet.replace("</span>", "")
snippet = snippet.replace('<span class="searchmatch">', "")
return html.unescape(snippet)
@app.route("/oauth/start")
def start_oauth() -> Response:
"""Start OAuth."""
next_page = flask.request.args.get("next")
if next_page:
flask.session["after_login"] = next_page
client_key = app.config["CLIENT_KEY"]
client_secret = app.config["CLIENT_SECRET"]
request_token_url = wiki_index_php + "?title=Special%3aOAuth%2finitiate"
oauth = OAuth1Session(client_key, client_secret=client_secret, callback_uri="oob")
fetch_response = oauth.fetch_request_token(request_token_url)
flask.session["owner_key"] = fetch_response.get("oauth_token")
flask.session["owner_secret"] = fetch_response.get("oauth_token_secret")
base_authorization_url = f"https://{wiki_hostname}/wiki/Special:OAuth/authorize"
authorization_url = oauth.authorization_url(
base_authorization_url, oauth_consumer_key=client_key
)
return flask.redirect(authorization_url)
@app.route("/oauth/callback", methods=["GET"])
def oauth_callback() -> werkzeug.wrappers.response.Response:
"""Oauth callback."""
client_key = app.config["CLIENT_KEY"]
client_secret = app.config["CLIENT_SECRET"]
oauth = OAuth1Session(
client_key,
client_secret=client_secret,
resource_owner_key=flask.session["owner_key"],
resource_owner_secret=flask.session["owner_secret"],
)
oauth_response = oauth.parse_authorization_response(flask.request.url)
verifier = oauth_response.get("oauth_verifier")
access_token_url = wiki_index_php + "?title=Special%3aOAuth%2ftoken"
oauth = OAuth1Session(
client_key,
client_secret=client_secret,
resource_owner_key=flask.session["owner_key"],
resource_owner_secret=flask.session["owner_secret"],
verifier=verifier,
)
oauth_tokens = oauth.fetch_access_token(access_token_url)
flask.session["owner_key"] = oauth_tokens.get("oauth_token")
flask.session["owner_secret"] = oauth_tokens.get("oauth_token_secret")
print("login successful")
next_page = flask.session.get("after_login")
return flask.redirect(next_page if next_page else flask.url_for("index"))
@app.route("/oauth/disconnect")
def oauth_disconnect() -> werkzeug.wrappers.response.Response:
"""Disconnect OAuth."""
for key in "owner_key", "owner_secret", "username", "after_login":
if key in flask.session:
del flask.session[key]
return flask.redirect(flask.url_for("index"))
def match_type(q: str, snippet: str) -> str | None:
"""Discover match type, ''exact', 'case_mismatch' or None.
>>> match_type('foo', 'foo')
'exact'
>>> match_type('foo', 'bar') is None
True
>>> match_type('bar', 'foo bar baz')
'exact'
>>> match_type('clean coal technology', 'foo clean coal technologies baz')
'exact'
>>> match_type('bar', 'foo Bar baz')
'exact'
>>> match_type('bar', 'foo BAR baz')
'case_mismatch'
>>> match_type('foo-bar', 'aa foo-bar cc')
'exact'
>>> match_type(u'foo\u2013bar', 'aa foo-bar cc')
'exact'
"""
q = q.replace("\u2013", "-")
snippet = tidy_snippet(snippet)
if q in snippet or case_flip_first(q) in snippet:
return "exact"
match = None
if q.lower() in snippet.lower():
match = "case_mismatch"
if match != "exact" and q.endswith("y"):
if q[:-1] in snippet or case_flip_first(q[:-1]) in snippet:
return "exact"
elif match is None:
if q[:-1].lower() in snippet.lower():
match = "case_mismatch"
return match
class NoGoodHit(Exception):
pass
def get_best_hit(title: str, hits: list[Hit]) -> tuple[Hit, dict[str, typing.Any]]:
"""Find the best hit within the search results."""
for hit in hits:
if hit["title"].lower() == title.lower():
continue
if match_type(title, hit["snippet"]) != "exact":
continue
try:
print(f'get diff: {hit["title"]}, {title}')
found = get_diff(title, hit["title"], None)
except NoMatch:
print("no match")
continue
return (hit, found)
raise NoGoodHit
@app.route("/<path:url_title>", methods=["GET", "POST"])
def article_page(url_title: str) -> str | Response:
"""Article page."""
from_title = url_title.replace("_", " ").strip()
if flask.request.method == "POST":
hit_title = flask.request.form["hit"]
do_save(from_title, hit_title)
return flask.redirect(
flask.url_for("article_page", url_title=url_title, after=hit_title)
)
total = search_count(from_title)
with_link = search_count_with_link(from_title)
no_link_count, hits = search_no_link(from_title)
after = flask.request.args.get("after")
if after:
print(after)
hits_iter = itertools.dropwhile(lambda hit: hit["title"] != after, hits)
skip = next(hits_iter, None)
if skip:
hits = list(hits_iter)
try:
hit, found = get_best_hit(from_title, hits)
except NoGoodHit:
return flask.render_template("all_done.html")
return flask.render_template(
"article2.html",
title=from_title,
total=total,
with_link=with_link,
hit=hit,
replacement=found["replacement"],
diff=found["diff"],
found=found,
url_title=url_title,
)
def do_save(title: str, hit_title: str) -> str:
"""Update page on Wikipedia."""
token = wikidata_oauth.get_token()
found = get_match(title, hit_title, None)
summary = (
f"link [[{found['replacement']}]] using [[:en:User:Edward/Find link|Find link]]"
)
edit = mediawiki_api.edit_page(
pageid=found["pageid"],
section=found["section_num"],
text=found["section_text"],
summary=summary,
baserevid=found["revid"],
token=token,
)
return edit
@app.route("/saved")
def save_done() -> str:
"""Save complete."""
return flask.render_template("save_done.html")
@app.route("/api/1/hits")
def api_hits() -> werkzeug.wrappers.response.Response:
"""Return canidates for the given article title."""
title = flask.request.args.get("title")
assert title
ret = core.do_search(title)
return flask.jsonify(title=title, hits=ret["results"])
# mock_hits: list[Hit] = json.load(open("sample.json"))
# return flask.jsonify(title=title, hits=mock_hits)
@app.route("/api/1/valid_hit")
def api_valid_hit() -> werkzeug.wrappers.response.Response:
"""Return canidates for the given article title."""
link_from = flask.request.args.get("link_from")
link_to = flask.request.args.get("link_to")
try:
        found = get_diff(link_to, link_from, None)
    except NoMatch:
        return flask.jsonify(valid=False)
    return flask.jsonify(valid=True, diff=found["diff"], replacement=found["replacement"])
@app.route("/favicon.ico")
def favicon() -> None:
flask.abort(404)
if __name__ == "__main__":
app.run(host="0.0.0.0", port=8000)