Initial commit
commit f07b407e7a
4  .gitignore  vendored  Normal file
@@ -0,0 +1,4 @@
__pycache__
.mypy_cache/
node_modules
package-lock.json
22  add_front_end_libraries.py  Executable file
@@ -0,0 +1,22 @@
#!/usr/bin/python3

import os
import shutil
import subprocess

STATIC_DIR = "static"

assert os.path.exists("package.json") and os.path.exists("node_modules")

if not os.path.exists(STATIC_DIR):
    os.mkdir(STATIC_DIR)

shutil.copytree(
    "node_modules/bootstrap/dist/",
    os.path.join(STATIC_DIR, "bootstrap"),
    dirs_exist_ok=True,
)

subprocess.run(["npm", "run", "build"], check=True)

shutil.copy("dist/add_links.es.js", STATIC_DIR)
0  add_links/__init__.py  Normal file
284  add_links/api.py  Normal file
@@ -0,0 +1,284 @@
import re
from typing import Any

import requests
from requests.adapters import HTTPAdapter
from simplejson.scanner import JSONDecodeError

from .language import get_current_language
from .util import is_disambig

ua = (
    "find-link/2.2 "
    + "(https://github.com/EdwardBetts/find_link; contact: edward@4angle.com)"
)
re_disambig = re.compile(r"^(.*) \((.*)\)$")


def get_query_url() -> str:
    """Get the Wikipedia query API URL for the current language."""
    return f"https://{get_current_language()}.wikipedia.org/w/api.php"


sessions = {}


def get_session():
    lang = get_current_language()
    if lang in sessions:
        return sessions[lang]
    s = requests.Session()
    s.headers = {"User-Agent": ua}
    # mount retries for this session's own language, not only en.wikipedia.org
    s.mount(f"https://{lang}.wikipedia.org", HTTPAdapter(max_retries=10))
    s.params = {
        "format": "json",
        "action": "query",
        "formatversion": 2,
    }
    sessions[lang] = s
    return s


class MediawikiError(Exception):
    pass


class MultipleRedirects(Exception):
    pass


class IncompleteReply(Exception):
    pass


class MissingPage(Exception):
    pass


def check_for_error(json_data):
    if "error" in json_data:
        raise MediawikiError(json_data["error"]["info"])


webpage_error = (
    "Our servers are currently under maintenance or experiencing a technical problem."
)


def api_get(params: dict[str, Any]) -> dict[str, Any]:
    """Make call to Wikipedia API."""
    s = get_session()

    r = s.get(get_query_url(), params=params)
    try:
        ret = r.json()
    except JSONDecodeError:
        if webpage_error in r.text:
            raise MediawikiError(webpage_error)
        else:
            raise MediawikiError("unknown error")
    check_for_error(ret)
    return ret


def get_first_page(params: dict[str, str]) -> dict[str, Any]:
    """Run Wikipedia API query and return the first page."""
    page = api_get(params)["query"]["pages"][0]
    if page.get("missing"):
        raise MissingPage
    return page


def random_article_list(limit=50):
    params = {
        "list": "random",
        "rnnamespace": "0",
        "rnlimit": limit,
    }

    return api_get(params)["query"]["random"]


def wiki_search(q):
    m = re_disambig.match(q)
    if m:
        search = '"{}" AND "{}"'.format(*m.groups())
    else:
        search = '"{}"'.format(q)

    params = {
        "list": "search",
        "srwhat": "text",
        "srlimit": 50,
        "srsearch": search,
        "continue": "",
    }
    ret = api_get(params)
    query = ret["query"]
    totalhits = query["searchinfo"]["totalhits"]
    results = query["search"]
    for _ in range(10):
        if "continue" not in ret:
            break
        params["sroffset"] = ret["continue"]["sroffset"]
        ret = api_get(params)
        results += ret["query"]["search"]
    return (totalhits, results)


def get_wiki_info(q):
    params = {
        "prop": "info",
        "redirects": "",
        "titles": q,
    }
    ret = api_get(params)["query"]
    if "interwiki" in ret:
        return None
    redirects = []
    if ret.get("redirects"):
        redirects = ret["redirects"]
        if len(redirects) != 1:
            # multiple redirects, we should explain to the user that this is
            # unsupported
            raise MultipleRedirects
    if ret["pages"][0].get("missing"):
        raise MissingPage(q)
    return redirects[0]["to"] if redirects else None


def cat_start(q: str) -> list[str]:
    """Find categories that start with this prefix."""
    params = {
        "list": "allpages",
        "apnamespace": 14,  # categories
        "apfilterredir": "nonredirects",
        "aplimit": 500,
        "apprefix": q,
    }
    ret = api_get(params)["query"]
    return [i["title"] for i in ret["allpages"] if i["title"] != q]


def all_pages(q: str) -> list[str]:
    """Get all article titles with a given prefix."""
    params = {
        "list": "allpages",
        "apnamespace": 0,
        "apfilterredir": "nonredirects",
        "aplimit": 500,
        "apprefix": q,
    }
    ret = api_get(params)["query"]
    return [i["title"] for i in ret["allpages"] if i["title"] != q]


def categorymembers(q: str) -> list[str]:
    """List of category members."""
    params = {
        "list": "categorymembers",
        "cmnamespace": 0,
        "cmlimit": 500,
        "cmtitle": q[0].upper() + q[1:],
    }
    ret = api_get(params)["query"]
    return [i["title"] for i in ret["categorymembers"] if i["title"] != q]


def page_links(titles):  # unused
    titles = list(titles)
    assert titles
    params = {
        "prop": "links",
        "pllimit": 500,
        "plnamespace": 0,
        "titles": "|".join(titles),
    }
    ret = api_get(params)["query"]
    return {
        doc["title"]: {link["title"] for link in doc["links"]}
        for doc in ret["pages"]  # formatversion=2 returns pages as a list
        if "links" in doc
    }


def find_disambig(titles: list[str]) -> list[str]:
    """Find disambiguation articles in the given list of titles."""
    titles = list(titles)
    assert titles
    pos = 0
    disambig: list[str] = []
    params = {
        "prop": "templates",
        "tllimit": 500,
        "tlnamespace": 10,  # templates
        "continue": "",
    }
    while pos < len(titles):
        params["titles"] = "|".join(titles[pos : pos + 50])
        ret = api_get(params)
        disambig.extend(
            doc["title"] for doc in ret["query"]["pages"] if is_disambig(doc)
        )
        for _ in range(10):
            if "continue" not in ret:
                break
            tlcontinue = ret["continue"]["tlcontinue"]
            params["titles"] = "|".join(titles[pos : pos + 50])
            params["tlcontinue"] = tlcontinue
            ret = api_get(params)
            disambig.extend(
                doc["title"] for doc in ret["query"]["pages"] if is_disambig(doc)
            )
        pos += 50

    return disambig


def wiki_redirects(q):  # pages that link here
    params = {
        "list": "backlinks",
        "blfilterredir": "redirects",
        "bllimit": 500,
        "blnamespace": 0,
        "bltitle": q,
    }
    docs = api_get(params)["query"]["backlinks"]
    assert all("redirect" in doc for doc in docs)
    return (doc["title"] for doc in docs)


def wiki_backlink(q: str) -> tuple[set[str], set[str]]:
    """Get backlinks for article."""
    params = {
        "list": "backlinks",
        "bllimit": 500,
        "blnamespace": 0,
        "bltitle": q,
        "continue": "",
    }
    ret = api_get(params)
    docs = ret["query"]["backlinks"]
    while "continue" in ret:
        params["blcontinue"] = ret["continue"]["blcontinue"]
        ret = api_get(params)
        docs += ret["query"]["backlinks"]

    articles = {doc["title"] for doc in docs if "redirect" not in doc}
    redirects = {doc["title"] for doc in docs if "redirect" in doc}
    return (articles, redirects)


def call_get_diff(title, section_num, section_text):
    data = {
        "prop": "revisions",
        "rvprop": "timestamp",
        "titles": title,
        "rvsection": section_num,
        "rvdifftotext": section_text.strip(),
    }

    s = get_session()
    ret = s.post(get_query_url(), data=data).json()
    check_for_error(ret)
    return ret["query"]["pages"][0]["revisions"][0]["diff"]["body"]
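A minimal sketch of how these search helpers compose (not part of the commit; the query string is hypothetical and live access to the Wikipedia API is assumed):

from add_links.api import find_disambig, wiki_backlink, wiki_search

q = "clean coal technology"  # hypothetical query
totalhits, results = wiki_search(q)  # follows up to ~10 continuation pages
articles, redirects = wiki_backlink(q)  # titles that already link to q
candidates = [doc["title"] for doc in results if doc["title"] not in articles]
dab = set(find_disambig(candidates)) if candidates else set()  # drop dab pages
print(totalhits, [t for t in candidates if t not in dab])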
198  add_links/core.py  Normal file
@@ -0,0 +1,198 @@
"""Core functions."""

import html
import re
import typing
from pprint import pprint

from .api import (
    MediawikiError,
    all_pages,
    cat_start,
    categorymembers,
    find_disambig,
    get_first_page,
    wiki_backlink,
    wiki_search,
)
from .util import case_flip_first, norm

re_redirect = re.compile(r"#REDIRECT \[\[(.)([^#]*?)(#.*)?\]\]")


def get_content_and_timestamp(title: str) -> tuple[str, str]:
    """Get article content and timestamp of last update."""
    params = {
        "prop": "revisions|info",
        "rvprop": "content|timestamp",
        "titles": title,
    }
    json_data: dict[str, typing.Any] = get_first_page(params)
    if json_data.get("invalid"):
        raise MediawikiError(json_data["invalidreason"])
    rev = json_data["revisions"][0]
    return (rev["content"], rev["timestamp"])


def get_revision_info(title: str) -> dict[str, typing.Any]:
    """Get info about latest revision of article."""
    params = {
        "prop": "revisions|info",
        "rvprop": "content|timestamp|ids",
        "titles": title,
    }
    json_data: dict[str, typing.Any] = get_first_page(params)
    if json_data.get("invalid"):
        raise MediawikiError(json_data["invalidreason"])
    revs = json_data.pop("revisions")
    ret = revs[0]
    ret["pageid"] = json_data["pageid"]
    pprint(json_data)
    return typing.cast(dict[str, typing.Any], ret)


def is_redirect_to(title_from: str, title_to: str) -> bool:
    title_from = title_from.replace("_", " ")
    params = {"prop": "info", "titles": title_from}
    if "redirect" not in get_first_page(params):
        return False

    params = {"prop": "revisions", "rvprop": "content", "titles": title_from}
    page_text = get_first_page(params)["revisions"][0]["content"]
    m = re_redirect.match(page_text)
    assert m
    title_to = title_to[0].upper() + title_to[1:]
    return m.group(1).upper() + m.group(2) == title_to


def find_longer(
    q: str, search: list[dict[str, typing.Any]], articles: set[str]
) -> list[str]:
    """Find other articles with titles that are longer."""
    this_title = q[0].upper() + q[1:]
    longer: list[str] = all_pages(this_title)
    lq = q.lower()
    for doc in search:
        lt = doc["title"].lower()
        if lq == lt or lq not in lt:
            continue
        articles.add(doc["title"])
        more_articles, more_redirects = wiki_backlink(doc["title"])
        articles.update(more_articles)
        if doc["title"] not in longer:
            longer.append(doc["title"])

    return longer


def tidy_snippet(snippet: str) -> str:
    """Remove HTML from snippet."""
    snippet = snippet.replace("\u2013", "-")
    snippet = snippet.replace("</span>", "")
    snippet = snippet.replace('<span class="searchmatch">', "")
    return html.unescape(snippet)


def match_type(q: str, snippet: str) -> str | None:
    """Discover match type: 'exact', 'case_mismatch' or None.

    >>> match_type('foo', 'foo')
    'exact'
    >>> match_type('foo', 'bar') is None
    True
    >>> match_type('bar', 'foo bar baz')
    'exact'
    >>> match_type('clean coal technology', 'foo clean coal technologies baz')
    'exact'
    >>> match_type('bar', 'foo Bar baz')
    'exact'
    >>> match_type('bar', 'foo BAR baz')
    'case_mismatch'
    >>> match_type('foo-bar', 'aa foo-bar cc')
    'exact'
    >>> match_type(u'foo\u2013bar', 'aa foo-bar cc')
    'exact'
    """
    q = q.replace("\u2013", "-")
    snippet = tidy_snippet(snippet)

    if q in snippet or case_flip_first(q) in snippet:
        return "exact"
    match = None
    if q.lower() in snippet.lower():
        match = "case_mismatch"
    if match != "exact" and q.endswith("y"):
        if q[:-1] in snippet or case_flip_first(q[:-1]) in snippet:
            return "exact"
    elif match is None:
        if q[:-1].lower() in snippet.lower():
            match = "case_mismatch"
    return match


def do_search(
    q: str, redirect_to: str | None = None
) -> dict[str, int | list[dict[str, typing.Any]] | list[str] | None]:
    this_title = q[0].upper() + q[1:]

    totalhits, search_hits = wiki_search(q)
    articles, redirects = wiki_backlink(redirect_to or q)
    cm = set()
    start = cat_start(q)
    if len(start) > 5:
        start = []  # big categories take too long
    for cat in set(["Category:" + this_title] + start):
        cm.update(categorymembers(cat))

    norm_q = norm(q)
    norm_match_redirect = {r for r in redirects if norm(r) == norm_q}
    longer_redirect = {r for r in redirects if q.lower() in r.lower()}

    articles.add(this_title)
    if redirect_to:
        articles.add(redirect_to[0].upper() + redirect_to[1:])

    for r in norm_match_redirect | longer_redirect:
        articles.add(r)
        a2, r2 = wiki_backlink(r)
        articles.update(a2)
        redirects.update(r2)

    longer = find_longer(q, search_hits, articles) if len(q) > 6 else None

    search: list[dict[str, typing.Any]] = [
        doc
        for doc in search_hits
        if doc["title"] not in articles and doc["title"] not in cm
    ]
    if search:
        disambig = set(find_disambig([doc["title"] for doc in search]))
        search = [doc for doc in search if doc["title"] not in disambig]
        # and (doc['title'] not in links or this_title not in links[doc['title']])]
    for doc in search:
        without_markup = (
            doc["snippet"]
            .replace("<span class='searchmatch'>", "")
            .replace("</span>", "")
            .replace("  ", " ")
        )
        doc["match"] = match_type(q, without_markup)
        doc["snippet_without_markup"] = without_markup
    return {
        "totalhits": totalhits,
        "results": search,
        "longer": longer,
    }


def get_case_from_content(title: str) -> str | None:
    """Check article content to find the case of the article title."""
    content, timestamp = get_content_and_timestamp(title)
    if title == title.lower() and title in content:
        return title
    start = content.lower().find("'''" + title.replace("_", " ").lower() + "'''")
    if start != -1:
        return content[start + 3 : start + 3 + len(title)]

    return None  # article doesn't contain the title
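A hedged sketch of calling do_search, the entry point that ties the helpers above together (not part of the commit; the query is made up and network access is assumed):

from add_links.core import do_search

ret = do_search("clean coal technology")  # hypothetical query
print(ret["totalhits"], len(ret["results"]))
for doc in ret["results"]:
    # match_type() classified each snippet as "exact", "case_mismatch" or None
    print(doc["match"], doc["title"])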
146  add_links/language.py  Normal file
@@ -0,0 +1,146 @@
from flask import session, has_request_context

langs = [
    ('af', 'Afrikaans', 'Afrikaans'),
    ('als', 'Alemannisch', 'Alemannic'),
    ('am', 'አማርኛ', 'Amharic'),
    ('an', 'aragonés', 'Aragonese'),
    ('ar', 'العربية', 'Arabic'),
    ('arz', 'مصرى', 'Egyptian Arabic'),
    ('ast', 'asturianu', 'Asturian'),
    ('az', 'azərbaycanca', 'Azerbaijani'),
    ('azb', 'تۆرکجه', 'Southern Azerbaijani'),
    ('ba', 'башҡортса', 'Bashkir'),
    ('bar', 'Boarisch', 'Bavarian'),
    ('bat-smg', 'žemaitėška', 'Samogitian'),
    ('be', 'беларуская', 'Belarusian'),
    ('be-tarask', 'беларуская (тарашкевіца)', 'Belarusian (Taraškievica)'),
    ('bg', 'български', 'Bulgarian'),
    ('bn', 'বাংলা', 'Bengali'),
    ('bpy', 'বিষ্ণুপ্রিয়া মণিপুরী', 'Bishnupriya Manipuri'),
    ('br', 'brezhoneg', 'Breton'),
    ('bs', 'bosanski', 'Bosnian'),
    ('bug', 'ᨅᨔ ᨕᨘᨁᨗ', 'Buginese'),
    ('ca', 'català', 'Catalan'),
    ('ce', 'нохчийн', 'Chechen'),
    ('ceb', 'Cebuano', 'Cebuano'),
    ('ckb', 'کوردیی ناوەندی', 'Kurdish (Sorani)'),
    ('cs', 'čeština', 'Czech'),
    ('cv', 'Чӑвашла', 'Chuvash'),
    ('cy', 'Cymraeg', 'Welsh'),
    ('da', 'dansk', 'Danish'),
    ('de', 'Deutsch', 'German'),
    ('el', 'Ελληνικά', 'Greek'),
    ('en', 'English', 'English'),
    ('eo', 'Esperanto', 'Esperanto'),
    ('es', 'español', 'Spanish'),
    ('et', 'eesti', 'Estonian'),
    ('eu', 'euskara', 'Basque'),
    ('fa', 'فارسی', 'Persian'),
    ('fi', 'suomi', 'Finnish'),
    ('fo', 'føroyskt', 'Faroese'),
    ('fr', 'français', 'French'),
    ('fy', 'Frysk', 'West Frisian'),
    ('ga', 'Gaeilge', 'Irish'),
    ('gd', 'Gàidhlig', 'Scottish Gaelic'),
    ('gl', 'galego', 'Galician'),
    ('gu', 'ગુજરાતી', 'Gujarati'),
    ('he', 'עברית', 'Hebrew'),
    ('hi', 'हिन्दी', 'Hindi'),
    ('hr', 'hrvatski', 'Croatian'),
    ('hsb', 'hornjoserbsce', 'Upper Sorbian'),
    ('ht', 'Kreyòl ayisyen', 'Haitian'),
    ('hu', 'magyar', 'Hungarian'),
    ('hy', 'Հայերեն', 'Armenian'),
    ('ia', 'interlingua', 'Interlingua'),
    ('id', 'Bahasa Indonesia', 'Indonesian'),
    ('io', 'Ido', 'Ido'),
    ('is', 'íslenska', 'Icelandic'),
    ('it', 'italiano', 'Italian'),
    ('ja', '日本語', 'Japanese'),
    ('jv', 'Basa Jawa', 'Javanese'),
    ('ka', 'ქართული', 'Georgian'),
    ('kk', 'қазақша', 'Kazakh'),
    ('kn', 'ಕನ್ನಡ', 'Kannada'),
    ('ko', '한국어', 'Korean'),
    ('ku', 'Kurdî', 'Kurdish (Kurmanji)'),
    ('ky', 'Кыргызча', 'Kirghiz'),
    ('la', 'Latina', 'Latin'),
    ('lb', 'Lëtzebuergesch', 'Luxembourgish'),
    ('li', 'Limburgs', 'Limburgish'),
    ('lmo', 'lumbaart', 'Lombard'),
    ('lt', 'lietuvių', 'Lithuanian'),
    ('lv', 'latviešu', 'Latvian'),
    ('map-bms', 'Basa Banyumasan', 'Banyumasan'),
    ('mg', 'Malagasy', 'Malagasy'),
    ('min', 'Baso Minangkabau', 'Minangkabau'),
    ('mk', 'македонски', 'Macedonian'),
    ('ml', 'മലയാളം', 'Malayalam'),
    ('mn', 'монгол', 'Mongolian'),
    ('mr', 'मराठी', 'Marathi'),
    ('mrj', 'кырык мары', 'Hill Mari'),
    ('ms', 'Bahasa Melayu', 'Malay'),
    ('my', 'မြန်မာဘာသာ', 'Burmese'),
    ('mzn', 'مازِرونی', 'Mazandarani'),
    ('nah', 'Nāhuatl', 'Nahuatl'),
    ('nap', 'Napulitano', 'Neapolitan'),
    ('nds', 'Plattdüütsch', 'Low Saxon'),
    ('ne', 'नेपाली', 'Nepali'),
    ('new', 'नेपाल भाषा', 'Newar'),
    ('nl', 'Nederlands', 'Dutch'),
    ('nn', 'norsk nynorsk', 'Norwegian (Nynorsk)'),
    ('no', 'norsk bokmål', 'Norwegian (Bokmål)'),
    ('oc', 'occitan', 'Occitan'),
    ('or', 'ଓଡ଼ିଆ', 'Oriya'),
    ('os', 'Ирон', 'Ossetian'),
    ('pa', 'ਪੰਜਾਬੀ', 'Eastern Punjabi'),
    ('pl', 'polski', 'Polish'),
    ('pms', 'Piemontèis', 'Piedmontese'),
    ('pnb', 'پنجابی', 'Western Punjabi'),
    ('pt', 'português', 'Portuguese'),
    ('qu', 'Runa Simi', 'Quechua'),
    ('ro', 'română', 'Romanian'),
    ('ru', 'русский', 'Russian'),
    ('sa', 'संस्कृतम्', 'Sanskrit'),
    ('sah', 'саха тыла', 'Sakha'),
    ('scn', 'sicilianu', 'Sicilian'),
    ('sco', 'Scots', 'Scots'),
    ('sh', 'srpskohrvatski / српскохрватски', 'Serbo-Croatian'),
    ('si', 'සිංහල', 'Sinhalese'),
    ('simple', 'Simple English', 'Simple English'),
    ('sk', 'slovenčina', 'Slovak'),
    ('sl', 'slovenščina', 'Slovenian'),
    ('sq', 'shqip', 'Albanian'),
    ('sr', 'српски / srpski', 'Serbian'),
    ('su', 'Basa Sunda', 'Sundanese'),
    ('sv', 'svenska', 'Swedish'),
    ('sw', 'Kiswahili', 'Swahili'),
    ('ta', 'தமிழ்', 'Tamil'),
    ('te', 'తెలుగు', 'Telugu'),
    ('tg', 'тоҷикӣ', 'Tajik'),
    ('th', 'ไทย', 'Thai'),
    ('tl', 'Tagalog', 'Tagalog'),
    ('tr', 'Türkçe', 'Turkish'),
    ('tt', 'татарча/tatarça', 'Tatar'),
    ('uk', 'українська', 'Ukrainian'),
    ('ur', 'اردو', 'Urdu'),
    ('uz', 'oʻzbekcha/ўзбекча', 'Uzbek'),
    ('vec', 'vèneto', 'Venetian'),
    ('vi', 'Tiếng Việt', 'Vietnamese'),
    ('vo', 'Volapük', 'Volapük'),
    ('wa', 'walon', 'Walloon'),
    ('war', 'Winaray', 'Waray'),
    ('yi', 'ייִדיש', 'Yiddish'),
    ('yo', 'Yorùbá', 'Yoruba'),
    ('zh', '中文', 'Chinese'),
    ('zh-min-nan', 'Bân-lâm-gú', 'Min Nan'),
    ('zh-yue', '粵語', 'Cantonese'),
]


def get_langs() -> list[dict[str, str]]:
    """List of all known languages."""
    return [dict(zip(('code', 'local', 'english'), lang)) for lang in langs]


def get_current_language() -> str:
    """Return the language code (ISO 639-1 for most wikis) for the current language."""
    return session.get('current_lang', 'en') if has_request_context() else 'en'
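For orientation, a small sketch of the two helpers (not part of the commit); the first assertion just restates the shape of the langs table:

from add_links.language import get_current_language, get_langs

first = get_langs()[0]
assert first == {'code': 'af', 'local': 'Afrikaans', 'english': 'Afrikaans'}
assert get_current_language() == 'en'  # default outside a Flask request context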
381  add_links/match.py  Normal file
@@ -0,0 +1,381 @@
from __future__ import unicode_literals

import re
import typing

from .api import MissingPage, call_get_diff, get_wiki_info
from .core import get_case_from_content, get_content_and_timestamp, get_revision_info
from .util import is_title_case, lc_alpha

re_link_in_text = re.compile(r"\[\[[^]]+?\]\]", re.I | re.S)


class LinkReplace(Exception):
    pass


en_dash = "\u2013"
trans = {",": ",?", " ": " *[-\n]? *"}
trans[en_dash] = trans[" "]

trans2 = {" ": r"('?s?\]\])?'?s? ?(\[\[(?:.+\|)?)?", "-": "[- ]"}
trans2[en_dash] = trans2[" "]

patterns = [
    lambda q: re.compile(
        r"(?<!-)(?:\[\[(?:[^]]+\|)?)?(%s)%s(?:\]\])?"
        % (
            re.escape(q[0]),
            "".join("-?" + (trans2[c] if c in trans2 else re.escape(c)) for c in q[1:]),
        ),
        re.I,
    ),
    lambda q: re.compile(
        r"(?<!-)\[\[[^|]+\|(%s)%s\]\]" % (re.escape(q[0]), re.escape(q[1:])), re.I
    ),
    lambda q: re.compile(
        r"(?<!-)\[\[[^|]+\|(%s)%s(?:\]\])?"
        % (
            re.escape(q[0]),
            "".join("-?" + (trans2[c] if c in trans2 else re.escape(c)) for c in q[1:]),
        ),
        re.I,
    ),
    lambda q: re.compile(r"(?<!-)(%s)%s" % (re.escape(q[0]), re.escape(q[1:])), re.I),
    lambda q: re.compile(
        r"(?<!-)(%s)%s"
        % (
            re.escape(q[0]),
            "".join((trans[c] if c in trans else re.escape(c)) for c in q[1:]),
        ),
        re.I,
    ),
]


class NoMatch(Exception):
    pass


re_cite = re.compile(
    r"<ref( [^>]*?)?>\s*({{cite.*?}}|\[https?://[^]]*?\])\s*</ref>", re.I | re.S
)


def parse_cite(text: str) -> typing.Iterator[tuple[str, str]]:
    """Parse a citation template."""
    prev = 0
    for m in re_cite.finditer(text):
        yield ("text", text[prev : m.start()])
        yield ("cite", m.group(0))
        prev = m.end()
    yield ("text", text[prev:])


re_heading = re.compile(r"^\s*(=+)\s*(.+)\s*\1(<!--.*-->|\s)*$")


def section_iter(text: str) -> typing.Iterator[tuple[str | None, str]]:
    """Iterate sections yielding tuples of heading and section text."""
    cur_section = ""
    heading = None
    in_comment = False
    for line in text.splitlines(True):
        if "<!--" in line:
            in_comment = True
        if "-->" in line:
            in_comment = False
        m = re_heading.match(line)
        if in_comment or not m:
            cur_section += line
            continue
        if cur_section or heading:
            yield (heading, cur_section)
        heading = m.group()
        cur_section = ""
    yield (heading, cur_section)


def get_subsections(text: str, section_num: int) -> str:
    """Retrieve the text of subsections for a given section number within an article."""
    found = ""
    collection_level = None
    for num, (heading, body) in enumerate(section_iter(text)):
        if heading is None:
            level = 0
        else:
            m = re_heading.match(heading)
            assert m
            level = len(m.group(1))
        if num == section_num:
            collection_level = level
            continue
        if collection_level:
            if level > collection_level:
                assert heading
                found += heading + body
            else:
                break
    return found


def match_found(m, q, linkto):
    if q[1:] == m.group(0)[1:]:
        replacement = m.group(1) + q[1:]
    elif any(c.isupper() for c in q[1:]) or m.group(0) == m.group(0).upper():
        replacement = q
    elif is_title_case(m.group(0)):
        replacement = get_case_from_content(q)
        if replacement is None:
            replacement = q.lower()
    else:
        replacement = m.group(1) + q[1:]
    assert replacement
    if linkto:
        if linkto[0].isupper() and replacement[0] == linkto[0].lower():
            linkto = linkto[0].lower() + linkto[1:]
        elif replacement[0].isupper():
            linkto = linkto[0].upper() + linkto[1:]
        replacement = linkto + "|" + replacement
    return replacement


def parse_links(text: str) -> typing.Iterator[tuple[str, str]]:
    prev = 0
    for m in re_link_in_text.finditer(text):
        if prev != m.start():
            yield ("text", text[prev : m.start()])
        if any(
            m.group().lower().startswith("[[" + prefix)
            for prefix in ("file:", "image:")
        ):
            yield ("image", m.group(0))
        else:
            yield ("link", m.group(0))
        prev = m.end()
    if prev < len(text):
        yield ("text", text[prev:])


def mk_link_matcher(q):
    re_links = [p(q) for p in patterns]

    def search_for_link(text):
        for re_link in re_links:
            m = re_link.search(text)
            if m and m.group(0).count("[[") < 4:
                return m

    return search_for_link


def add_link(m, replacement, text):
    return m.re.sub(lambda m: "[[%s]]" % replacement, text, count=1)


def find_link_in_chunk(q, content, linkto=None):
    search_for_link = mk_link_matcher(q)
    new_content = ""
    replacement = None

    match_in_non_link = False
    bad_link_match = False
    found_text_to_link = None

    for token_type, text in parse_links(content):
        if token_type == "text":
            if search_for_link(text):
                match_in_non_link = True
        elif token_type == "image":
            before, sep, link_text = text[:-2].rpartition("|")
            m = search_for_link(link_text)
            if m:
                found_text_to_link = m.group(0)
                replacement = match_found(m, q, linkto)
                text = before + sep + add_link(m, replacement, link_text) + "]]"
        elif token_type == "link" and not replacement and not match_in_non_link:
            link_text = text[2:-2]
            link_dest = None
            if "|" in link_text:
                link_dest, link_text = link_text.split("|", 1)
            m = search_for_link(link_text)
            if m and (not link_dest or not link_dest.startswith("#")):
                lc_alpha_q = lc_alpha(q)

                bad_link_match = (
                    link_dest
                    and len(link_dest) > len(q)
                    and (lc_alpha_q not in lc_alpha(link_dest))
                )
                if not link_dest:
                    if q in link_text and len(link_text) > len(q):
                        bad_link_match = True
                if bad_link_match and link_dest:
                    try:
                        link_dest_redirect = get_wiki_info(link_dest)
                    except MissingPage:
                        link_dest_redirect = None
                    if (
                        link_dest_redirect
                        and lc_alpha(link_dest_redirect) == lc_alpha_q
                    ):
                        bad_link_match = False
                if not bad_link_match:
                    replacement = match_found(m, q, linkto)
                    found_text_to_link = m.group(0)
                    text = add_link(m, replacement, link_text)
        new_content += text
    if not replacement:
        if bad_link_match:
            raise LinkReplace
        m = search_for_link(content)
        if m:
            found_text_to_link = m.group(0)
            replacement = match_found(m, q, linkto)
            new_content = add_link(m, replacement, content)
            if linkto:
                m_end = m.end()
                re_extend = re.compile(m.re.pattern + r"\w*\b", re.I)
                m = re_extend.search(content)
                if m and m.end() > m_end:
                    replacement += content[m_end : m.end()]
                    new_content = add_link(m, replacement, content)
    return (new_content, replacement, found_text_to_link)


def find_link_in_text(q, content):
    # find_link_in_chunk returns a 3-tuple; the found text is not needed here
    (new_content, replacement, _) = find_link_in_chunk(q, content)
    if replacement:
        return (new_content, replacement)
    raise NoMatch


def find_link_in_content(q, content, linkto=None):
    if linkto:
        try:
            return find_link_in_content(linkto, content)
        except NoMatch:
            pass
    replacement = None
    new_content = ""
    link_replace = False
    for header, section_text in section_iter(content):
        if header:
            new_content += header
        for token_type, text in parse_cite(section_text):
            if token_type == "text" and not replacement:
                try:
                    (new_text, replacement, replaced_text) = find_link_in_chunk(
                        q, text, linkto=linkto
                    )
                except LinkReplace:
                    link_replace = True
                if replacement:
                    text = new_text
            new_content += text
    if replacement:
        return (new_content, replacement, replaced_text)
    raise LinkReplace if link_replace else NoMatch


def find_link_and_section(q, content, linkto=None):
    if linkto:
        try:
            return find_link_and_section(linkto, content)
        except NoMatch:
            pass
    sections = list(section_iter(content))
    replacement = None

    search_for_link = mk_link_matcher(q)

    found: dict[str, str | int] = {}

    for section_num, (header, section_text) in enumerate(sections):
        new_content = ""
        if header:
            new_content += header
        for token_type, text in parse_cite(section_text):
            if token_type == "text" and not replacement:
                new_text = ""
                for token_type2, text2 in parse_links(text):
                    if token_type2 == "link" and not replacement:
                        link_text = text2[2:-2]
                        if "|" in link_text:
                            link_dest, link_text = link_text.split("|", 1)
                        else:
                            link_dest = None
                        m = search_for_link(link_text)
                        if m:
                            if link_dest:
                                found["link_dest"] = link_dest
                            found["link_text"] = link_text
                            replacement = match_found(m, q, None)
                            text2 = add_link(m, replacement, link_text)
                    new_text += text2
                if replacement:
                    text = new_text
                else:
                    m = search_for_link(text)
                    if m:
                        replacement = match_found(m, q, linkto)
                        text = add_link(m, replacement, text)
            new_content += text
        if replacement:
            found.update(
                {
                    "section_num": section_num,
                    "section_text": new_content,
                    "old_text": (header or "") + section_text,
                    "replacement": replacement,
                }
            )
            return found
    raise NoMatch


def find_refs(text: str) -> list[str]:
    """Find <ref> in wikitext."""

    refs = re.findall("<ref(?:[^>]*)>(.+?)</ref>", text)
    print(refs)
    return refs


def new_link_is_in_ref(replacement: str, text: str) -> bool:
    """Is the new link in a <ref>."""
    link = f"[[{replacement}]]"
    return any(link in ref for ref in find_refs(text))


def get_match(q: str, title: str, linkto: str | None) -> dict[str, typing.Any]:
    """Get match."""
    rev = get_revision_info(title)

    found: dict[str, typing.Any] = find_link_and_section(q, rev["content"], linkto)

    assert not new_link_is_in_ref(found["replacement"], found["section_text"])

    found["revid"] = rev["revid"]
    found["pageid"] = rev["pageid"]
    found["section_text"] += get_subsections(rev["content"], found["section_num"])

    return found


def get_diff(q: str, title: str, linkto: str | None) -> dict[str, typing.Any]:
    """Get diff."""
    content, timestamp = get_content_and_timestamp(title)
    found: dict[str, typing.Any] = find_link_and_section(q, content, linkto)

    if new_link_is_in_ref(found["replacement"], found["section_text"]):
        raise NoMatch

    section_text = found["section_text"] + get_subsections(
        content, found["section_num"]
    )

    found["diff"] = call_get_diff(title, found["section_num"], section_text)
    return found
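A sketch of the matcher applied to made-up wikitext (not part of the commit); it shows the dict that find_link_and_section returns:

from add_links.match import find_link_and_section

# hypothetical two-section article
wikitext = "Intro.\n== History ==\nThe clean coal technology was trialled.\n"
found = find_link_and_section("clean coal technology", wikitext)
assert found["section_num"] == 1
assert found["replacement"] == "clean coal technology"
print(found["section_text"])  # History section with [[clean coal technology]]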
101  add_links/mediawiki_api.py  Normal file
@@ -0,0 +1,101 @@
"""Interface with the mediawiki API."""

import typing
from pprint import pprint
from typing import Any, cast

from . import wikidata_oauth

wiki_hostname = "en.wikipedia.org"
wiki_api_php = f"https://{wiki_hostname}/w/api.php"
user_agent = "add-links/0.1"


def parse_page(enwiki: str) -> dict[str, Any]:
    """Call mediawiki parse API for given article."""
    params: dict[str, str | int] = {
        "action": "parse",
        "format": "json",
        "formatversion": 2,
        "disableeditsection": 1,
        "page": enwiki,
        "prop": "text|links|headhtml",
        "disabletoc": 1,
    }

    parse: dict[str, Any] = call(params)["parse"]
    return parse


def call(params: dict[str, str | int]) -> dict[str, typing.Any]:
    """Make POST request to mediawiki API via OAuth."""
    data = wikidata_oauth.api_post_request(params)
    return cast(dict[str, Any], data.json())


def article_exists(title: str) -> bool:
    """Check whether an article exists."""
    params: dict[str, str | int] = {
        "action": "query",
        "format": "json",
        "formatversion": 2,
        "titles": title,
    }
    return not call(params)["query"]["pages"][0].get("missing")


def get_content(title: str) -> tuple[str, int]:
    """Get article text and the latest revision id."""
    params: dict[str, str | int] = {
        "action": "query",
        "format": "json",
        "formatversion": 2,
        "prop": "revisions|info",
        "rvprop": "content|timestamp|ids",
        "titles": title,
    }
    data = call(params)
    rev = data["query"]["pages"][0]["revisions"][0]
    content: str = rev["content"]
    revid: int = int(rev["revid"])
    return content, revid


def compare(title: str, new_text: str) -> str:
    """Generate a diff for the new article text."""
    params: dict[str, str | int] = {
        "format": "json",
        "formatversion": 2,
        "action": "compare",
        "fromtitle": title,
        "toslots": "main",
        "totext-main": new_text,
        "prop": "diff",
    }
    diff: str = call(params)["compare"]["body"]
    return diff


def edit_page(
    pageid: int, section: str | int, text: str, summary: str, baserevid: str, token: str
) -> str:
    """Edit a page on Wikipedia."""
    params: dict[str, str | int] = {
        "format": "json",
        "formatversion": 2,
        "action": "edit",
        "pageid": pageid,
        "text": text,
        "baserevid": baserevid,
        "token": token,
        "nocreate": 1,
        "summary": summary,
        "section": section,
    }
    ret = call(params)
    if "edit" not in ret:
        print("params")
        pprint(params)
        print()
        pprint(ret)
    return typing.cast(str, ret["edit"])
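A hedged sketch of the edit round-trip these functions support (not part of the commit; it needs an OAuth session, and the title, page id, and edit are made up):

from add_links import mediawiki_api, wikidata_oauth

title = "Example article"  # hypothetical title
content, revid = mediawiki_api.get_content(title)
new_text = content.replace("clean coal", "[[clean coal]]", 1)  # made-up change
print(mediawiki_api.compare(title, new_text))  # HTML diff body, nothing saved
mediawiki_api.edit_page(
    pageid=12345,  # hypothetical page id
    section=0,
    text=new_text,
    summary="link [[clean coal]] using [[User:Edward/Find link|Find link]]",
    baserevid=revid,
    token=wikidata_oauth.get_token(),
)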
48  add_links/mediawiki_api_old.py  Normal file
@@ -0,0 +1,48 @@
"""Interface with the mediawiki API."""

from typing import Any

import requests

wiki_hostname = "en.wikipedia.org"
wiki_api_php = f"https://{wiki_hostname}/w/api.php"
user_agent = "dab-mechanic/0.1"


def parse_page(enwiki: str) -> dict[str, Any]:
    """Call mediawiki parse API for given article."""
    params: dict[str, str | int] = {
        "action": "parse",
        "format": "json",
        "formatversion": 2,
        "disableeditsection": 1,
        "page": enwiki,
        "prop": "text|links|headhtml",
        "disabletoc": 1,
    }

    parse: dict[str, Any] = get(params)["parse"]
    return parse


def get(params: dict[str, str | int]) -> dict[str, Any]:
    """Make GET request to mediawiki API."""
    data: dict[str, Any] = requests.get(
        wiki_api_php, headers={"User-Agent": user_agent}, params=params
    ).json()
    return data


def get_content(title: str) -> str:
    """Get article text."""
    params: dict[str, str | int] = {
        "action": "query",
        "format": "json",
        "formatversion": 2,
        "prop": "revisions|info",
        "rvprop": "content|timestamp",
        "titles": title,
    }
    data = get(params)
    rev: str = data["query"]["pages"][0]["revisions"][0]["content"]
    return rev
115  add_links/util.py  Normal file
@@ -0,0 +1,115 @@
"""Util functions."""

import re
import urllib.parse
from typing import Any

# util functions that don't access the network

namespaces = {
    ns.casefold()
    for ns in (
        "Special",
        "Media",
        "Talk",
        "Template",
        "Portal",
        "Portal talk",
        "Book",
        "Book talk",
        "Template talk",
        "Draft",
        "Draft talk",
        "Help",
        "Help talk",
        "Category",
        "Category talk",
        "User",
        "Gadget",
        "Gadget talk",
        "Gadget definition",
        "Gadget definition talk",
        "Topic",
        "User talk",
        "Wikipedia",
        "Education Program",
        "Education Program talk",
        "Wikipedia talk",
        "File",
        "File talk",
        "TimedText",
        "TimedText talk",
        "MediaWiki",
        "Module",
        "Module talk",
        "MediaWiki talk",
    )
}

re_space_or_dash = re.compile("[ -]")


def is_title_case(phrase: str) -> bool:
    """Is a given phrase in Title Case."""
    return all(
        term[0].isupper() and term[1:].islower()
        for term in re_space_or_dash.split(phrase)
        if term and term[0].isalpha()
    )


def urlquote(value: str) -> str:
    """Prepare string for use in URL param."""
    return urllib.parse.quote_plus(value.encode("utf-8"))


def strip_parens(q: str) -> str:
    """Remove a word in parenthesis from the end of a string."""
    m = re.search(r" \(.*?\)$", q)
    return q[: m.start()] if m else q


def starts_with_namespace(title: str) -> bool:
    """Check if a title starts with a namespace."""
    return ":" in title and title.split(":", 1)[0].casefold() in namespaces


def is_disambig(doc: dict[str, Any]) -> bool:
    """Is this a disambiguation page."""
    return any(
        "disambig" in t
        or t.endswith("dis")
        or "given name" in t
        or t == "template:surname"
        for t in (t["title"].lower() for t in doc.get("templates", []))
    )


def norm(s: str) -> str:
    """Normalise string."""
    s = re.sub(r"\W", "", s).lower()
    return s[:-1] if s and s[-1] == "s" else s


def case_flip(s: str) -> str:
    """Switch case of character."""
    if s.islower():
        return s.upper()
    if s.isupper():
        return s.lower()
    return s


def case_flip_first(s: str) -> str:
    """Switch case of first character in string."""
    return case_flip(s[0]) + s[1:]


def lc_alpha(s: str) -> str:
    """Lower case alphabetic characters in string."""
    return "".join(c.lower() for c in s if c.isalpha())


def wiki_space_norm(s: str) -> str:
    """Normalise article title."""
    return s.replace("_", " ").strip()
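A few doctest-style checks of the string helpers (not part of the commit):

from add_links.util import case_flip_first, is_title_case, norm, strip_parens

assert is_title_case("Clean Coal")
assert not is_title_case("clean coal")
assert norm("Red-Lines!") == "redline"  # non-word chars dropped, final "s" trimmed
assert case_flip_first("foo") == "Foo"
assert strip_parens("Mercury (planet)") == "Mercury"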
98  add_links/wikidata_oauth.py  Normal file
@@ -0,0 +1,98 @@
import typing
import urllib.parse
from typing import cast

from flask import current_app, session
from requests_oauthlib import OAuth1Session

wiki_hostname = "en.wikipedia.org"
api_url = f"https://{wiki_hostname}/w/api.php"


def get_edit_proxy() -> dict[str, str]:
    """Retrieve proxy information from config."""
    edit_proxy = current_app.config.get("EDIT_PROXY")
    if edit_proxy:
        return {"http": edit_proxy, "https": edit_proxy}
    else:
        return {}


def api_post_request(params: dict[str, str | int]):
    """HTTP POST using OAuth."""
    app = current_app
    # url = "https://www.wikidata.org/w/api.php"
    client_key = app.config["CLIENT_KEY"]
    client_secret = app.config["CLIENT_SECRET"]
    oauth = OAuth1Session(
        client_key,
        client_secret=client_secret,
        resource_owner_key=session["owner_key"],
        resource_owner_secret=session["owner_secret"],
    )
    proxies = get_edit_proxy()
    return oauth.post(api_url, data=params, timeout=4, proxies=proxies)


def raw_request(params: typing.Mapping[str, str | int]):
    """Low-level API request."""
    app = current_app
    # url = "https://www.wikidata.org/w/api.php?" + urlencode(params)
    client_key = app.config["CLIENT_KEY"]
    client_secret = app.config["CLIENT_SECRET"]
    oauth = OAuth1Session(
        client_key,
        client_secret=client_secret,
        resource_owner_key=session["owner_key"],
        resource_owner_secret=session["owner_secret"],
    )
    proxies = get_edit_proxy()
    return oauth.get(
        api_url + "?" + urllib.parse.urlencode(params), timeout=4, proxies=proxies
    )


def api_request(params: typing.Mapping[str, str | int]) -> dict[str, typing.Any]:
    """Make an API request with OAuth."""
    r = raw_request(params)
    try:
        return cast(dict[str, typing.Any], r.json())
    except Exception:
        print("text")
        print(r.text)
        print("---")
        raise


def get_token() -> str:
    """Get CSRF token from MediaWiki API."""
    params: dict[str, str | int] = {
        "action": "query",
        "meta": "tokens",
        "format": "json",
        "formatversion": 2,
    }
    reply = api_request(params)
    token: str = reply["query"]["tokens"]["csrftoken"]

    return token


def userinfo_call() -> typing.Mapping[str, typing.Any]:
    """Request user information via OAuth."""
    params = {"action": "query", "meta": "userinfo", "format": "json"}
    return api_request(params)


def get_username() -> None | str:
    """Get the username or None if not logged in."""
    if "owner_key" not in session:
        return None  # not authorized

    if "username" not in session:
        reply = userinfo_call()
        if "query" not in reply:
            return None
        session["username"] = reply["query"]["userinfo"]["name"]

    return cast(str, session["username"])
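A sketch of the Flask configuration this module assumes (not part of the commit; the values are placeholders):

import flask

app = flask.Flask(__name__)
app.config["CLIENT_KEY"] = "..."  # OAuth1 consumer key (placeholder)
app.config["CLIENT_SECRET"] = "..."  # OAuth1 consumer secret (placeholder)
app.config["EDIT_PROXY"] = ""  # optional proxy for edits; empty disables it
# session["owner_key"] and session["owner_secret"] are expected to be set
# after the OAuth handshake; get_username() returns None until then.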
201  add_links/wikipedia.py  Normal file
|
@ -0,0 +1,201 @@
|
||||||
|
from collections import defaultdict
|
||||||
|
from typing import Any, Iterator, Optional, TypedDict
|
||||||
|
|
||||||
|
import flask
|
||||||
|
import lxml.html
|
||||||
|
|
||||||
|
from . import mediawiki_api
|
||||||
|
|
||||||
|
disambig_templates = [
|
||||||
|
"Template:Disambiguation",
|
||||||
|
"Template:Airport disambiguation",
|
||||||
|
"Template:Biology disambiguation",
|
||||||
|
"Template:Call sign disambiguation",
|
||||||
|
"Template:Caselaw disambiguation",
|
||||||
|
"Template:Chinese title disambiguation",
|
||||||
|
"Template:Disambiguation cleanup",
|
||||||
|
"Template:Genus disambiguation",
|
||||||
|
"Template:Hospital disambiguation",
|
||||||
|
"Template:Human name disambiguation",
|
||||||
|
"Template:Human name disambiguation cleanup",
|
||||||
|
"Template:Letter-number combination disambiguation",
|
||||||
|
"Template:Mathematical disambiguation",
|
||||||
|
"Template:Military unit disambiguation",
|
||||||
|
"Template:Music disambiguation",
|
||||||
|
"Template:Number disambiguation",
|
||||||
|
"Template:Opus number disambiguation",
|
||||||
|
"Template:Phonetics disambiguation",
|
||||||
|
"Template:Place name disambiguation",
|
||||||
|
"Template:Portal disambiguation",
|
||||||
|
"Template:Road disambiguation",
|
||||||
|
"Template:School disambiguation",
|
||||||
|
"Template:Species Latin name abbreviation disambiguation",
|
||||||
|
"Template:Species Latin name disambiguation",
|
||||||
|
"Template:Station disambiguation",
|
||||||
|
"Template:Synagogue disambiguation",
|
||||||
|
"Template:Taxonomic authority disambiguation",
|
||||||
|
"Template:Taxonomy disambiguation",
|
||||||
|
"Template:Template disambiguation",
|
||||||
|
"Template:WoO number disambiguation",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def link_params(enwiki: str) -> dict[str, str | int]:
|
||||||
|
"""Parameters for finding article links from the API."""
|
||||||
|
params: dict[str, str | int] = {
|
||||||
|
"action": "query",
|
||||||
|
"format": "json",
|
||||||
|
"formatversion": 2,
|
||||||
|
"titles": enwiki,
|
||||||
|
"generator": "links",
|
||||||
|
"gpllimit": "max",
|
||||||
|
"gplnamespace": 0,
|
||||||
|
"tllimit": "max",
|
||||||
|
"redirects": 1,
|
||||||
|
"tlnamespace": 10,
|
||||||
|
"tltemplates": "|".join(disambig_templates),
|
||||||
|
"prop": "templates",
|
||||||
|
}
|
||||||
|
return params
|
||||||
|
|
||||||
|
|
||||||
|
def needs_disambig(link: dict[str, Any]) -> bool:
|
||||||
|
"""Is this a disambiguation link."""
|
||||||
|
return bool(
|
||||||
|
not link["title"].endswith(" (disambiguation)") and link.get("templates")
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def get_article_links(enwiki: str) -> list[str]:
|
||||||
|
"""Get links that appear in this article."""
|
||||||
|
|
||||||
|
params: dict[str, str | int] = link_params(enwiki)
|
||||||
|
links: set[str] = set()
|
||||||
|
|
||||||
|
redirects = defaultdict(set)
|
||||||
|
|
||||||
|
while True:
|
||||||
|
data = mediawiki_api.get(params)
|
||||||
|
pages = data["query"].pop("pages")
|
||||||
|
for r in data["query"].pop("redirects"):
|
||||||
|
redirects[r["to"]].add(r["from"])
|
||||||
|
|
||||||
|
links.update(page["title"] for page in pages if needs_disambig(page))
|
||||||
|
|
||||||
|
if "continue" not in data:
|
||||||
|
break
|
||||||
|
|
||||||
|
params["gplcontinue"] = data["continue"]["gplcontinue"]
|
||||||
|
|
||||||
|
for link in set(links):
|
||||||
|
if link in redirects:
|
||||||
|
links.update(redirects[link])
|
||||||
|
|
||||||
|
return list(links)
|
||||||
|
|
||||||
|
# return {link["title"] for link in r.json()["query"]["pages"][0]["links"]}
|
||||||
|
|
||||||
|
|
||||||
|
def get_article_html(enwiki: str) -> str:
|
||||||
|
"""Parse article wikitext and return HTML."""
|
||||||
|
text: str = mediawiki_api.parse_page(enwiki)["text"]
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
class DabItem(TypedDict):
|
||||||
|
"""Represent a disabiguation page."""
|
||||||
|
|
||||||
|
num: int
|
||||||
|
title: str
|
||||||
|
html: str
|
||||||
|
|
||||||
|
|
||||||
|
def delete_toc(root: lxml.html.HtmlElement) -> None:
|
||||||
|
"""Delete table of contents from article HTML."""
|
||||||
|
for toc in root.findall(".//div[@class='toc']"):
|
||||||
|
toc.getparent().remove(toc)
|
||||||
|
|
||||||
|
|
||||||
|
def get_dab_html(dab_num: int, title: str) -> str:
|
||||||
|
"""Parse dab page and rewrite links."""
|
||||||
|
dab_html = get_article_html(title)
|
||||||
|
root = lxml.html.fromstring(dab_html)
|
||||||
|
delete_toc(root)
|
||||||
|
|
||||||
|
element_id_map = {e.get("id"): e for e in root.findall(".//*[@id]")}
|
||||||
|
|
||||||
|
for a in root.findall(".//a[@href]"):
|
||||||
|
href: str | None = a.get("href")
|
||||||
|
if not href:
|
||||||
|
continue
|
||||||
|
if not href.startswith("#"):
|
||||||
|
a.set("href", "#")
|
||||||
|
a.set("onclick", f"return select_dab(this, {dab_num})")
|
||||||
|
continue
|
||||||
|
|
||||||
|
destination_element = element_id_map[href[1:]]
|
||||||
|
assert destination_element is not None
|
||||||
|
destination_element.set("id", f"{dab_num}{href[1:]}")
|
||||||
|
a.set("href", f"#{dab_num}{href[1:]}")
|
||||||
|
|
||||||
|
html: str = lxml.html.tostring(root, encoding=str)
|
||||||
|
return html
|
||||||
|
|
||||||
|
|
||||||
|
class Article:
    """Current article we're working on."""

    def __init__(self, enwiki: str) -> None:
        """Make a new Article object."""
        self.enwiki = enwiki.replace("_", " ")

        self.links = get_article_links(enwiki)

        self.dab_list: list[DabItem] = []
        self.dab_lookup: dict[int, str] = {}
        self.dab_order: list[str] = []
        self.parse: Optional[dict[str, Any]] = None

    def save_endpoint(self) -> str:
        """Endpoint for saving changes."""
        href: str = flask.url_for("save", enwiki=self.enwiki.replace(" ", "_"))
        return href

    def load(self) -> None:
        """Load parsed article HTML."""
        self.parse = mediawiki_api.parse_page(self.enwiki)
        self.root = lxml.html.fromstring(self.parse.pop("text"))

    def iter_links(self) -> Iterator[tuple[lxml.html.HtmlElement, str]]:
        """Disambiguation links that need fixing."""
        seen = set()
        for a in self.root.findall(".//a[@href]"):
            title = a.get("title")
            if title is None or title not in self.links:
                continue
            a.set("class", "disambig")

            if title in seen:
                continue
            seen.add(title)

            yield a, title

    def process_links(self) -> None:
        """Process links in parsed wikitext."""
        for dab_num, (a, title) in enumerate(self.iter_links()):
            a.set("id", f"dab-{dab_num}")

            dab: DabItem = {
                "num": dab_num,
                "title": title,
                "html": get_dab_html(dab_num, title),
            }
            self.dab_list.append(dab)
            self.dab_order.append(title)
            self.dab_lookup[dab_num] = title

    def get_html(self) -> str:
        """Return the processed article HTML."""
        html: str = lxml.html.tostring(self.root, encoding=str)
        return html

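# Hypothetical usage sketch (title and flow assumed, not from the original
# file): how the Article class above is driven end to end.
def article_usage_sketch() -> str:
    article = Article("Python (programming language)")  # hypothetical title
    article.load()  # fetch and parse the article HTML
    article.process_links()  # number each dab link and build dab_list
    return article.get_html()  # rewritten HTML ready for the template
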
114
cmdline.py
Executable file
@ -0,0 +1,114 @@
#!/usr/bin/python3

import collections
import json
import re
import sys
import time
import typing

from add_links import api

# from_title = sys.argv[1]

re_disambig = re.compile(r"^(.*) \((.*)\)$")

def article_title_to_search_query(title: str) -> str:
    """Convert from article title to search query string."""
    m = re_disambig.match(title)
    return f'"{m.group(1)}" AND "{m.group(2)}"' if m else f'"{title}"'

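# A disambiguated title becomes two quoted AND terms, while a plain title is
# quoted whole (hypothetical titles):
#     article_title_to_search_query("Mercury (planet)")  -> '"Mercury" AND "planet"'
#     article_title_to_search_query("Mercury")           -> '"Mercury"'
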
def run_search(q: str, limit: int | str = "max") -> dict[str, typing.Any]:
    """Search Wikipedia."""
    params = {"list": "search", "srwhat": "text", "srlimit": limit, "srsearch": q}
    return typing.cast(dict[str, typing.Any], api.api_get(params)["query"])

def search_no_link(q: str) -> tuple[int, list[dict[str, str | int]]]:
    """Search for mentions of article title with no link included."""
    query = run_search(article_title_to_search_query(q) + f' -linksto:"{q}"', "max")
    totalhits = query["searchinfo"]["totalhits"]
    results = query["search"]
    return (totalhits, results)

def search_count(q: str) -> int:
    """How often does this article title appear in Wikipedia, excluding the article itself?"""
    query = run_search(article_title_to_search_query(q), limit=0)
    return typing.cast(int, query["searchinfo"]["totalhits"]) - 1

def search_count_with_link(q: str) -> int:
    """How often does this article title appear in Wikipedia with a link to it?"""
    query = run_search(article_title_to_search_query(q) + f' linksto:"{q}"', limit=0)
    return typing.cast(int, query["searchinfo"]["totalhits"])

def parse_contribs() -> list[tuple[str, int]]:
    """Count the titles most often linked in past Find link edits."""
    re_comment = re.compile(r"^link \[\[(.*)\]\] using")

    links: collections.Counter[str] = collections.Counter()

    for line in open("../wikipedia-contribs/contribs"):
        if (
            '"comment": "link ' not in line
            or "using [[User:Edward/Find link|Find link]]" not in line
        ):
            continue
        comment = json.loads(line)["comment"]

        m = re_comment.match(comment)
        if not m:
            continue
        link = m.group(1)

        if "|" not in link:
            links[link] += 1

    return links.most_common(200)

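# Each line of the contribs dump is one JSON object per edit. A hypothetical
# line that would increment the counter for "Coal mining":
#     {"comment": "link [[Coal mining]] using [[User:Edward/Find link|Find link]]"}
# Piped links such as [[Coal mining|mining]] are deliberately skipped.
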
with open("examples") as f:
    seen = {json.loads(line)["title"] for line in f}

out = open("examples", "a")
for from_title, num in parse_contribs():
    if from_title in seen:
        continue
    count = search_count(from_title)
    count_with_link = search_count_with_link(from_title)
    ratio = float(count_with_link) / float(count)

    print(from_title, count, count_with_link, f"{ratio:.1%}")
    print(
        json.dumps(
            {"title": from_title, "total": count, "with_links": count_with_link}
        ),
        file=out,
    )
    out.flush()
    time.sleep(0.1)
out.close()

sys.exit(0)

# Everything below is unreachable scratch code, kept from the original.
count = search_count(from_title)
count_with_link = search_count_with_link(from_title)
ratio = float(count_with_link) / float(count)

print(count, count_with_link, f"{ratio:.1%}")

sys.exit(0)

totalhits, search_hits = search_no_link(from_title)

for hit in search_hits:
    print(" ", hit)
print(count, count_with_link, f"{ratio:.1%}", totalhits, len(search_hits))

# ret = core.do_search(from_title)
# print(ret)
23
frontend/.eslintrc.js
Normal file
@ -0,0 +1,23 @@
module.exports = {
  "env": {
    "browser": true,
    "es6": true
  },
  "extends": [
    "plugin:vue/essential",
    "standard"
  ],
  "globals": {
    "Atomics": "readonly",
    "SharedArrayBuffer": "readonly"
  },
  "parserOptions": {
    "ecmaVersion": 14,
    "sourceType": "module"
  },
  "plugins": [
    "vue"
  ],
  "rules": {
  }
};
55
frontend/App.vue
Normal file
@ -0,0 +1,55 @@
<template>
  Hello world: {{ title }}

  <div v-for="hit in hits" :key="hit.pageid" class="mt-3">
    <div><strong>{{ hit.title }}</strong> ({{ hit.wordcount }} words)</div>
    <div v-html="hit.snippet"></div>
    <table v-html="hit.diff"></table>
    <div>replacement: {{ hit.replacement }}</div>
  </div>
</template>

<script>
import axios from "redaxios";

export default {
  props: {
    title: String,
    api_base_url: String,
  },
  data() {
    return {
      hits: [],
    };
  },
  computed: {
  },
  watch: {
  },
  methods: {
    api_call(endpoint, options) {
      var url = `${this.api_base_url}/${endpoint}`;
      return axios.get(url, options).catch(this.show_api_error_modal);
    },
    add_hit(hit) {
      var params = { link_from: hit.title, link_to: this.title };
      this.api_call("valid_hit", { params: params }).then((response) => {
        if (response.data.valid) {
          hit.diff = response.data.diff;
          hit.replacement = response.data.replacement;
          this.hits.push(hit);
        }
      });
    }
  },
  mounted() {
    var params = { title: this.title };
    this.api_call("hits", { params: params }).then((response) => {
      response.data.hits.forEach((hit) => { this.add_hit(hit) });
    });
  }
};
</script>

<style>
</style>
7
frontend/entry.js
Normal file
@ -0,0 +1,7 @@
import {createApp} from 'vue';
import App from './App.vue';

export default function(props) {
  const app = createApp(App, props).mount('#app');
  return app;
}
18
package.json
Normal file
@ -0,0 +1,18 @@
{
  "name": "add-links",
  "version": "0.0.1",
  "scripts": {
    "dev": "vite",
    "build": "vite build"
  },
  "dependencies": {
    "bootstrap": "^5.2.3",
    "vue": "^3.3.4"
  },
  "devDependencies": {
    "@vitejs/plugin-vue": "^4.2.3",
    "eslint": "^8.41.0",
    "eslint-plugin-vue": "^9.13.0",
    "vite": "^4.3.8"
  }
}
10
templates/all_done.html
Normal file
@ -0,0 +1,10 @@
{% extends "base.html" %}

{% block title %}All done{% endblock %}

{% block content %}
<div class="container">
  <h1>All done</h1>
  <div><a href="{{ url_for('index') }}">back to index</a></div>
</div>
{% endblock %}
56
templates/article.html
Normal file
@ -0,0 +1,56 @@
{% extends "base.html" %}

{% block title %}{{ title }}{% endblock %}

{% block style %}
<style>
span.exact { padding: 2px; background: green; color: white; font-weight: bold; }
span.nomatch { padding: 2px; background: red; color: white; font-weight: bold; }
span.case_mismatch { padding: 2px; background: orange; color: white; font-weight: bold; }
span.searchmatch { font-weight: bold; }

table.diff,td.diff-otitle,td.diff-ntitle{background-color:white}
td.diff-otitle,td.diff-ntitle{text-align:center}
td.diff-marker{text-align:right;font-weight:bold;font-size:1.25em}
td.diff-lineno{font-weight:bold}
td.diff-addedline,td.diff-deletedline,td.diff-context{font-size:88%;vertical-align:top;white-space:-moz-pre-wrap;white-space:pre-wrap}
td.diff-addedline,td.diff-deletedline{border-style:solid;border-width:1px 1px 1px 4px;border-radius:0.33em}
td.diff-addedline{border-color:#a3d3ff}
td.diff-deletedline{border-color:#ffe49c}
td.diff-context{background:#f3f3f3;color:#333333;border-style:solid;border-width:1px 1px 1px 4px;border-color:#e6e6e6;border-radius:0.33em}
.diffchange{font-weight:bold;text-decoration:none}
table.diff{border:none;width:98%;border-spacing:4px; table-layout:fixed}
td.diff-addedline .diffchange,td.diff-deletedline .diffchange{border-radius:0.33em;padding:0.25em 0}
td.diff-addedline .diffchange{background:#d8ecff}
td.diff-deletedline .diffchange{background:#feeec8}
table.diff td{padding:0.33em 0.66em}
table.diff col.diff-marker{width:2%}
table.diff col.diff-content{width:48%}
table.diff td div{ word-wrap:break-word; overflow:auto}
</style>
{% endblock %}

{% block content %}
<div class="container">
  <h1>{{ self.title() }}</h1>
  <form>
    <input name="q">
    <input type="submit" value="search">
  </form>
  <div id="app"></div>
</div>

<script type="module">
  import main from {{ url_for('static', filename='add_links.es.js') | tojson }};
  const props = {
    title: {{ title | tojson }},
    api_base_url: "/api/1"
  }
  main(props);
</script>
{% endblock %}
66
templates/article2.html
Normal file
@ -0,0 +1,66 @@
{% extends "base.html" %}

{% block title %}{{ title }}{% endblock %}

{% block style %}
<style>
span.exact { padding: 2px; background: green; color: white; font-weight: bold; }
span.nomatch { padding: 2px; background: red; color: white; font-weight: bold; }
span.case_mismatch { padding: 2px; background: orange; color: white; font-weight: bold; }
span.searchmatch { font-weight: bold; }

table.diff,td.diff-otitle,td.diff-ntitle{background-color:white}
td.diff-otitle,td.diff-ntitle{text-align:center}
td.diff-marker{text-align:right;font-weight:bold;font-size:1.25em}
td.diff-lineno{font-weight:bold}
td.diff-addedline,td.diff-deletedline,td.diff-context{font-size:88%;vertical-align:top;white-space:-moz-pre-wrap;white-space:pre-wrap}
td.diff-addedline,td.diff-deletedline{border-style:solid;border-width:1px 1px 1px 4px;border-radius:0.33em}
td.diff-addedline{border-color:#a3d3ff}
td.diff-deletedline{border-color:#ffe49c}
td.diff-context{background:#f3f3f3;color:#333333;border-style:solid;border-width:1px 1px 1px 4px;border-color:#e6e6e6;border-radius:0.33em}
.diffchange{font-weight:bold;text-decoration:none}
table.diff{border:none;width:98%;border-spacing:4px; table-layout:fixed}
td.diff-addedline .diffchange,td.diff-deletedline .diffchange{border-radius:0.33em;padding:0.25em 0}
td.diff-addedline .diffchange{background:#d8ecff}
td.diff-deletedline .diffchange{background:#feeec8}
table.diff td{padding:0.33em 0.66em}
table.diff col.diff-marker{width:2%}
table.diff col.diff-content{width:48%}
table.diff td div{ word-wrap:break-word; overflow:auto}
</style>
{% endblock %}

{% block content %}
<div class="container">
  <h1>{{ self.title() }}</h1>
  <form action="{{ url_for('index') }}">
    <input name="q">
    <input type="submit" value="search">
  </form>

  <div>Username: {{ g.user }}</div>

  <div><a href="https://en.wikipedia.org/wiki/{{ title }}" target="_blank">view article</a></div>

  <div><a href="{{ url_for('index') }}">back to index</a></div>

  <div>total: {{ total }}</div>
  <div>with link: {{ with_link }}</div>
  <div>ratio: {{ "{:.1%}".format(with_link / total) }}</div>
  <div>hit: {{ hit }}</div>
  <div>replacement: {{ found.replacement }}</div>
  <div>section: {{ found.section }}</div>
  <table>
    {{ diff | safe }}
  </table>
  <form method="POST">
    <input type="hidden" name="hit" value="{{ hit.title }}">
    <div class="my-3">
      <input type="submit" class="btn btn-primary" value="save"/>
      <a href="{{ url_for('article_page', url_title=url_title, after=hit['title']) }}" class="btn btn-primary">skip</a>
    </div>
  </form>
</div>
{% endblock %}
22
templates/base.html
Normal file
@ -0,0 +1,22 @@
<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <link href="{{ url_for("static", filename="bootstrap/css/bootstrap.min.css") }}" rel="stylesheet">
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />

  <title>
    {% block title %}{% endblock %}
  </title>

  {% block style %}{% endblock %}
</head>

<body>
  {% block content %}{% endblock %}

  <script src="{{ url_for("static", filename="bootstrap/js/bootstrap.bundle.min.js") }}"></script>

  {% block script %}{% endblock %}
</body>
</html>
25
templates/index.html
Normal file
@ -0,0 +1,25 @@
{% extends "base.html" %}

{% block title %}Index{% endblock %}

{% block content %}
<div class="container">
  <h1>Index</h1>
  <form>
    <input name="q">
    <input type="submit" value="search">
  </form>

  <div>Username: {{ g.user }}</div>

  <table class="table w-auto">
    <tr>
      <th>article</th>
      <th>total</th>
      <th>with links</th>
    </tr>
    {% for item in examples %}
    <tr>
      <td><a href="{{ article_url(item.title) }}">{{ item.title }}</a></td>
      <td>{{ item.total }}</td>
      <td>{{ "{:.1%}".format(item.with_links / item.total) }}</td>
    </tr>
    {% endfor %}
  </table>
</div>
{% endblock %}
10
templates/save_done.html
Normal file
@ -0,0 +1,10 @@
{% extends "base.html" %}

{% block title %}Save done{% endblock %}

{% block content %}
<div class="container">
  <h1>Save done</h1>
  <div>Save is complete.</div>
</div>
{% endblock %}
17
vite.config.js
Normal file
@ -0,0 +1,17 @@
import { defineConfig } from 'vite'
import vue from '@vitejs/plugin-vue'
import path from 'path'

export default defineConfig({
  plugins: [vue()],
  define: {
    'process.env.NODE_ENV': JSON.stringify('production'),
  },
  build: {
    lib: {
      entry: path.resolve(__dirname, 'frontend/entry.js'),
      name: 'AddLinks',
      fileName: (format) => `add_links.${format}.js`,
    },
  },
})
362
web_view.py
Executable file
@ -0,0 +1,362 @@
#!/usr/bin/python3

import html
import itertools
import json
import re
import typing

import flask
import werkzeug
from requests_oauthlib import OAuth1Session
from werkzeug.wrappers.response import Response

from add_links import api, core, mediawiki_api, wikidata_oauth
from add_links.match import NoMatch, get_diff, get_match

app = flask.Flask(__name__)
app.config.from_object("config.default")
app.debug = True

wiki_hostname = "en.wikipedia.org"
wiki_api_php = f"https://{wiki_hostname}/w/api.php"
wiki_index_php = f"https://{wiki_hostname}/w/index.php"

class Hit(typing.TypedDict):
    """A candidate article found by search."""

    ns: int
    title: str
    pageid: int
    size: int
    wordcount: int
    snippet: str
    timestamp: str


re_disambig = re.compile(r"^(.*) \((.*)\)$")

def load_examples() -> list[dict[str, str | int]]:
    """Load examples."""
    return [json.loads(line) for line in open("examples")]

def article_title_to_search_query(title: str) -> str:
    """Convert from article title to search query string."""
    m = re_disambig.match(title)
    return f'"{m.group(1)}" AND "{m.group(2)}"' if m else f'"{title}"'

def run_search(q: str, limit: int | str = "max") -> dict[str, typing.Any]:
    """Search Wikipedia."""
    params = {"list": "search", "srwhat": "text", "srlimit": limit, "srsearch": q}
    return typing.cast(dict[str, typing.Any], api.api_get(params)["query"])

def article_url(title: str) -> str:
    """URL for the article page."""
    return flask.url_for("article_page", url_title=title.replace(" ", "_"))

def search_count(q: str) -> int:
    """How often does this article title appear in Wikipedia, excluding the article itself?"""
    query = run_search(article_title_to_search_query(q), limit=0)
    return typing.cast(int, query["searchinfo"]["totalhits"]) - 1

def search_count_with_link(q: str) -> int:
    """How often does this article title appear in Wikipedia with a link to it?"""
    query = run_search(article_title_to_search_query(q) + f' linksto:"{q}"', limit=0)
    return typing.cast(int, query["searchinfo"]["totalhits"])

def search_no_link(q: str) -> tuple[int, list[Hit]]:
    """Search for mentions of article title with no link included."""
    query = run_search(article_title_to_search_query(q) + f' -linksto:"{q}"', "max")
    totalhits = query["searchinfo"]["totalhits"]
    results = query["search"]
    return (totalhits, results)

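# Together these helpers issue one CirrusSearch query each; for a
# hypothetical title "Mercury (planet)" the srsearch strings are:
#     "Mercury" AND "planet"                               (search_count)
#     "Mercury" AND "planet" linksto:"Mercury (planet)"    (search_count_with_link)
#     "Mercury" AND "planet" -linksto:"Mercury (planet)"   (search_no_link)
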
@app.before_request
def global_user() -> None:
    """Make username available everywhere."""
    flask.g.user = wikidata_oauth.get_username()

@app.route("/")
|
||||||
|
def index() -> str | Response:
|
||||||
|
"""Index page."""
|
||||||
|
if "oauth_verifier" in flask.request.args and "oauth_token" in flask.request.args:
|
||||||
|
return flask.redirect(flask.url_for("oauth_callback", **flask.request.args))
|
||||||
|
|
||||||
|
examples = load_examples()
|
||||||
|
examples.sort(
|
||||||
|
key=lambda i: float(i["with_links"]) / float(i["total"]), reverse=True
|
||||||
|
)
|
||||||
|
|
||||||
|
if q := flask.request.args.get("q"):
|
||||||
|
if q_trimmed := q.strip():
|
||||||
|
return flask.redirect(article_url(q_trimmed))
|
||||||
|
|
||||||
|
return flask.render_template(
|
||||||
|
"index.html", examples=examples, article_url=article_url
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def case_flip(s: str) -> str:
    """Switch case of character."""
    if s.islower():
        return s.upper()
    if s.isupper():
        return s.lower()
    return s


def case_flip_first(s: str) -> str:
    """Switch case of first character in string."""
    return case_flip(s[0]) + s[1:]

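# The first letter of a Wikipedia title is case-insensitive, so a snippet can
# legitimately match with either initial case; for example:
#     case_flip_first("clean coal")  -> "Clean coal"
#     case_flip_first("Bar")         -> "bar"
#     case_flip("7")                 -> "7"  (non-letters pass through)
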
def tidy_snippet(snippet: str) -> str:
    """Remove HTML from snippet."""
    snippet = snippet.replace("\u2013", "-")
    snippet = snippet.replace("</span>", "")
    snippet = snippet.replace('<span class="searchmatch">', "")
    return html.unescape(snippet)

@app.route("/oauth/start")
|
||||||
|
def start_oauth() -> Response:
|
||||||
|
"""Start OAuth."""
|
||||||
|
next_page = flask.request.args.get("next")
|
||||||
|
if next_page:
|
||||||
|
flask.session["after_login"] = next_page
|
||||||
|
|
||||||
|
client_key = app.config["CLIENT_KEY"]
|
||||||
|
client_secret = app.config["CLIENT_SECRET"]
|
||||||
|
request_token_url = wiki_index_php + "?title=Special%3aOAuth%2finitiate"
|
||||||
|
|
||||||
|
oauth = OAuth1Session(client_key, client_secret=client_secret, callback_uri="oob")
|
||||||
|
fetch_response = oauth.fetch_request_token(request_token_url)
|
||||||
|
|
||||||
|
flask.session["owner_key"] = fetch_response.get("oauth_token")
|
||||||
|
flask.session["owner_secret"] = fetch_response.get("oauth_token_secret")
|
||||||
|
|
||||||
|
base_authorization_url = f"https://{wiki_hostname}/wiki/Special:OAuth/authorize"
|
||||||
|
authorization_url = oauth.authorization_url(
|
||||||
|
base_authorization_url, oauth_consumer_key=client_key
|
||||||
|
)
|
||||||
|
return flask.redirect(authorization_url)
|
||||||
|
|
||||||
|
|
||||||
|
@app.route("/oauth/callback", methods=["GET"])
|
||||||
|
def oauth_callback() -> werkzeug.wrappers.response.Response:
|
||||||
|
"""Oauth callback."""
|
||||||
|
client_key = app.config["CLIENT_KEY"]
|
||||||
|
client_secret = app.config["CLIENT_SECRET"]
|
||||||
|
|
||||||
|
oauth = OAuth1Session(
|
||||||
|
client_key,
|
||||||
|
client_secret=client_secret,
|
||||||
|
resource_owner_key=flask.session["owner_key"],
|
||||||
|
resource_owner_secret=flask.session["owner_secret"],
|
||||||
|
)
|
||||||
|
|
||||||
|
oauth_response = oauth.parse_authorization_response(flask.request.url)
|
||||||
|
verifier = oauth_response.get("oauth_verifier")
|
||||||
|
access_token_url = wiki_index_php + "?title=Special%3aOAuth%2ftoken"
|
||||||
|
oauth = OAuth1Session(
|
||||||
|
client_key,
|
||||||
|
client_secret=client_secret,
|
||||||
|
resource_owner_key=flask.session["owner_key"],
|
||||||
|
resource_owner_secret=flask.session["owner_secret"],
|
||||||
|
verifier=verifier,
|
||||||
|
)
|
||||||
|
|
||||||
|
oauth_tokens = oauth.fetch_access_token(access_token_url)
|
||||||
|
flask.session["owner_key"] = oauth_tokens.get("oauth_token")
|
||||||
|
flask.session["owner_secret"] = oauth_tokens.get("oauth_token_secret")
|
||||||
|
|
||||||
|
print("login successful")
|
||||||
|
|
||||||
|
next_page = flask.session.get("after_login")
|
||||||
|
return flask.redirect(next_page if next_page else flask.url_for("index"))
|
||||||
|
|
||||||
|
|
||||||
|
@app.route("/oauth/disconnect")
|
||||||
|
def oauth_disconnect() -> werkzeug.wrappers.response.Response:
|
||||||
|
"""Disconnect OAuth."""
|
||||||
|
for key in "owner_key", "owner_secret", "username", "after_login":
|
||||||
|
if key in flask.session:
|
||||||
|
del flask.session[key]
|
||||||
|
return flask.redirect(flask.url_for("index"))
|
||||||
|
|
||||||
|
|
||||||
|
def match_type(q: str, snippet: str) -> str | None:
    """Discover match type: 'exact', 'case_mismatch' or None.

    >>> match_type('foo', 'foo')
    'exact'
    >>> match_type('foo', 'bar') is None
    True
    >>> match_type('bar', 'foo bar baz')
    'exact'
    >>> match_type('clean coal technology', 'foo clean coal technologies baz')
    'exact'
    >>> match_type('bar', 'foo Bar baz')
    'exact'
    >>> match_type('bar', 'foo BAR baz')
    'case_mismatch'
    >>> match_type('foo-bar', 'aa foo-bar cc')
    'exact'
    >>> match_type(u'foo\u2013bar', 'aa foo-bar cc')
    'exact'
    """
    q = q.replace("\u2013", "-")
    snippet = tidy_snippet(snippet)

    if q in snippet or case_flip_first(q) in snippet:
        return "exact"
    match = None
    if q.lower() in snippet.lower():
        match = "case_mismatch"
    if match != "exact" and q.endswith("y"):
        if q[:-1] in snippet or case_flip_first(q[:-1]) in snippet:
            return "exact"
        elif match is None:
            if q[:-1].lower() in snippet.lower():
                match = "case_mismatch"
    return match

class NoGoodHit(Exception):
    pass

def get_best_hit(title: str, hits: list[Hit]) -> tuple[Hit, dict[str, typing.Any]]:
    """Find the best hit within the search results."""
    for hit in hits:
        if hit["title"].lower() == title.lower():
            continue
        if match_type(title, hit["snippet"]) != "exact":
            continue

        try:
            print(f'get diff: {hit["title"]}, {title}')
            found = get_diff(title, hit["title"], None)
        except NoMatch:
            print("no match")
            continue

        return (hit, found)

    raise NoGoodHit

@app.route("/<path:url_title>", methods=["GET", "POST"])
|
||||||
|
def article_page(url_title: str) -> str | Response:
|
||||||
|
"""Article page."""
|
||||||
|
from_title = url_title.replace("_", " ").strip()
|
||||||
|
|
||||||
|
if flask.request.method == "POST":
|
||||||
|
hit_title = flask.request.form["hit"]
|
||||||
|
do_save(from_title, hit_title)
|
||||||
|
return flask.redirect(
|
||||||
|
flask.url_for("article_page", url_title=url_title, after=hit_title)
|
||||||
|
)
|
||||||
|
|
||||||
|
total = search_count(from_title)
|
||||||
|
with_link = search_count_with_link(from_title)
|
||||||
|
|
||||||
|
no_link_count, hits = search_no_link(from_title)
|
||||||
|
|
||||||
|
after = flask.request.args.get("after")
|
||||||
|
if after:
|
||||||
|
print(after)
|
||||||
|
hits_iter = itertools.dropwhile(lambda hit: hit["title"] != after, hits)
|
||||||
|
skip = next(hits_iter, None)
|
||||||
|
if skip:
|
||||||
|
hits = list(hits_iter)
|
||||||
|
|
||||||
|
try:
|
||||||
|
hit, found = get_best_hit(from_title, hits)
|
||||||
|
except NoGoodHit:
|
||||||
|
return flask.render_template("all_done.html")
|
||||||
|
|
||||||
|
return flask.render_template(
|
||||||
|
"article2.html",
|
||||||
|
title=from_title,
|
||||||
|
total=total,
|
||||||
|
with_link=with_link,
|
||||||
|
hit=hit,
|
||||||
|
replacement=found["replacement"],
|
||||||
|
diff=found["diff"],
|
||||||
|
found=found,
|
||||||
|
url_title=url_title,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def do_save(title: str, hit_title: str) -> str:
    """Update page on Wikipedia."""
    token = wikidata_oauth.get_token()

    found = get_match(title, hit_title, None)

    summary = (
        f"link [[{found['replacement']}]] using [[:en:User:Edward/Find link|Find link]]"
    )

    edit = mediawiki_api.edit_page(
        pageid=found["pageid"],
        section=found["section_num"],
        text=found["section_text"],
        summary=summary,
        baserevid=found["revid"],
        token=token,
    )

    return edit

@app.route("/saved")
|
||||||
|
def save_done() -> str:
|
||||||
|
"""Save complete."""
|
||||||
|
return flask.render_template("save_done.html")
|
||||||
|
|
||||||
|
|
||||||
|
@app.route("/api/1/hits")
|
||||||
|
def api_hits() -> werkzeug.wrappers.response.Response:
|
||||||
|
"""Return canidates for the given article title."""
|
||||||
|
title = flask.request.args.get("title")
|
||||||
|
assert title
|
||||||
|
ret = core.do_search(title)
|
||||||
|
return flask.jsonify(title=title, hits=ret["results"])
|
||||||
|
|
||||||
|
# mock_hits: list[Hit] = json.load(open("sample.json"))
|
||||||
|
# return flask.jsonify(title=title, hits=mock_hits)
|
||||||
|
|
||||||
|
|
||||||
|
@app.route("/api/1/valid_hit")
|
||||||
|
def api_valid_hit() -> werkzeug.wrappers.response.Response:
|
||||||
|
"""Return canidates for the given article title."""
|
||||||
|
link_from = flask.request.args.get("link_from")
|
||||||
|
link_to = flask.request.args.get("link_to")
|
||||||
|
|
||||||
|
try:
|
||||||
|
diff, replacement = get_diff(link_to, link_from, None)
|
||||||
|
except NoMatch:
|
||||||
|
return flask.jsonify(valid=False)
|
||||||
|
|
||||||
|
return flask.jsonify(valid=True, diff=diff, replacement=replacement)
|
||||||
|
|
||||||
|
|
||||||
|
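# A sketch of the round trip the frontend (App.vue above) makes against this
# endpoint; the titles and the diff payload here are hypothetical:
#     GET /api/1/valid_hit?link_from=History+of+coal&link_to=Coal
#     -> {"valid": true, "diff": "<tr>...</tr>", "replacement": "coal"}
# On NoMatch the reply is simply {"valid": false}.
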
@app.route("/favicon.ico")
|
||||||
|
def favicon() -> None:
|
||||||
|
flask.abort(404)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
app.run(host="0.0.0.0", port=8000)
|