Initial commit
commit f07b407e7a
.gitignore (vendored, Normal file, 4 lines)
@@ -0,0 +1,4 @@
__pycache__
.mypy_cache/
node_modules
package-lock.json
add_front_end_libraries.py (Executable file, 22 lines)
@@ -0,0 +1,22 @@
#!/usr/bin/python3

import os
import shutil
import subprocess

STATIC_DIR = "static"

assert os.path.exists("package.json") and os.path.exists("node_modules")

if not os.path.exists(STATIC_DIR):
    os.mkdir(STATIC_DIR)

shutil.copytree(
    "node_modules/bootstrap/dist/",
    os.path.join(STATIC_DIR, "bootstrap"),
    dirs_exist_ok=True,
)

subprocess.run(["npm", "run", "build"], check=True)

shutil.copy("dist/add_links.es.js", "static")
add_links/__init__.py (Normal file, 0 lines)
add_links/api.py (Normal file, 284 lines)
@@ -0,0 +1,284 @@
|||
import re
|
||||
from typing import Any
|
||||
|
||||
import requests
|
||||
from requests.adapters import HTTPAdapter
|
||||
from simplejson.scanner import JSONDecodeError
|
||||
|
||||
from .language import get_current_language
|
||||
from .util import is_disambig
|
||||
|
||||
ua = (
|
||||
"find-link/2.2 "
|
||||
+ "(https://github.com/EdwardBetts/find_link; contact: edward@4angle.com)"
|
||||
)
|
||||
re_disambig = re.compile(r"^(.*) \((.*)\)$")
|
||||
|
||||
|
||||
def get_query_url() -> str:
|
||||
"""Get the wikipedia query API for the current language."""
|
||||
return f"https://{get_current_language()}.wikipedia.org/w/api.php"
|
||||
|
||||
|
||||
sessions = {}
|
||||
|
||||
|
||||
def get_session() -> requests.Session:
|
||||
lang = get_current_language()
|
||||
if lang in sessions:
|
||||
return sessions[lang]
|
||||
s = requests.Session()
|
||||
s.headers = {"User-Agent": ua}
|
||||
s.mount("https://en.wikipedia.org", HTTPAdapter(max_retries=10))
|
||||
s.params = {
|
||||
"format": "json",
|
||||
"action": "query",
|
||||
"formatversion": 2,
|
||||
}
|
||||
sessions[lang] = s
|
||||
return s
|
||||
|
||||
|
||||
class MediawikiError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class MultipleRedirects(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class IncompleteReply(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class MissingPage(Exception):
|
||||
pass
|
||||
|
||||
|
||||
def check_for_error(json_data):
|
||||
if "error" in json_data:
|
||||
raise MediawikiError(json_data["error"]["info"])
|
||||
|
||||
|
||||
webpage_error = (
|
||||
"Our servers are currently under maintenance or experiencing a technical problem."
|
||||
)
|
||||
|
||||
|
||||
def api_get(params: dict[str, Any]) -> dict[str, Any]:
|
||||
"""Make call to Wikipedia API."""
|
||||
s = get_session()
|
||||
|
||||
r = s.get(get_query_url(), params=params)
|
||||
try:
|
||||
ret = r.json()
|
||||
except JSONDecodeError:
|
||||
if webpage_error in r.text:
|
||||
raise MediawikiError(webpage_error)
|
||||
else:
|
||||
raise MediawikiError("unknown error")
|
||||
check_for_error(ret)
|
||||
return ret
|
||||
|
||||
|
||||
def get_first_page(params: dict[str, str]) -> dict[str, Any]:
|
||||
"""Run Wikipedia API query and return the first page."""
|
||||
page = api_get(params)["query"]["pages"][0]
|
||||
if page.get("missing"):
|
||||
raise MissingPage
|
||||
return page
|
||||
|
||||
|
||||
def random_article_list(limit=50):
|
||||
params = {
|
||||
"list": "random",
|
||||
"rnnamespace": "0",
|
||||
"rnlimit": limit,
|
||||
}
|
||||
|
||||
return api_get(params)["query"]["random"]
|
||||
|
||||
|
||||
def wiki_search(q):
|
||||
m = re_disambig.match(q)
|
||||
if m:
|
||||
search = '"{}" AND "{}"'.format(*m.groups())
|
||||
else:
|
||||
search = '"{}"'.format(q)
|
||||
|
||||
params = {
|
||||
"list": "search",
|
||||
"srwhat": "text",
|
||||
"srlimit": 50,
|
||||
"srsearch": search,
|
||||
"continue": "",
|
||||
}
|
||||
ret = api_get(params)
|
||||
query = ret["query"]
|
||||
totalhits = query["searchinfo"]["totalhits"]
|
||||
results = query["search"]
|
||||
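# Follow the API "continue" offset to collect up to ten further pages of search results.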
for _ in range(10):
|
||||
if "continue" not in ret:
|
||||
break
|
||||
params["sroffset"] = ret["continue"]["sroffset"]
|
||||
ret = api_get(params)
|
||||
results += ret["query"]["search"]
|
||||
return (totalhits, results)
|
||||
|
||||
|
||||
def get_wiki_info(q):
|
||||
params = {
|
||||
"prop": "info",
|
||||
"redirects": "",
|
||||
"titles": q,
|
||||
}
|
||||
ret = api_get(params)["query"]
|
||||
if "interwiki" in ret:
|
||||
return None
|
||||
redirects = []
|
||||
if ret.get("redirects"):
|
||||
redirects = ret["redirects"]
|
||||
if len(redirects) != 1:
|
||||
# multiple redirects, we should explain to the user that this is
|
||||
# unsupported
|
||||
raise MultipleRedirects
|
||||
if ret["pages"][0].get("missing"):
|
||||
raise MissingPage(q)
|
||||
return redirects[0]["to"] if redirects else None
|
||||
|
||||
|
||||
def cat_start(q: str) -> list[str]:
|
||||
"""Find categories that start with this prefix."""
|
||||
params = {
|
||||
"list": "allpages",
|
||||
"apnamespace": 14, # categories
|
||||
"apfilterredir": "nonredirects",
|
||||
"aplimit": 500,
|
||||
"apprefix": q,
|
||||
}
|
||||
ret = api_get(params)["query"]
|
||||
return [i["title"] for i in ret["allpages"] if i["title"] != q]
|
||||
|
||||
|
||||
def all_pages(q: str) -> list[str]:
|
||||
"""Get all article titles with a given prefix."""
|
||||
params = {
|
||||
"list": "allpages",
|
||||
"apnamespace": 0,
|
||||
"apfilterredir": "nonredirects",
|
||||
"aplimit": 500,
|
||||
"apprefix": q,
|
||||
}
|
||||
ret = api_get(params)["query"]
|
||||
return [i["title"] for i in ret["allpages"] if i["title"] != q]
|
||||
|
||||
|
||||
def categorymembers(q: str) -> list[str]:
|
||||
"""List of category members."""
|
||||
params = {
|
||||
"list": "categorymembers",
|
||||
"cmnamespace": 0,
|
||||
"cmlimit": 500,
|
||||
"cmtitle": q[0].upper() + q[1:],
|
||||
}
|
||||
ret = api_get(params)["query"]
|
||||
return [i["title"] for i in ret["categorymembers"] if i["title"] != q]
|
||||
|
||||
|
||||
def page_links(titles): # unused
|
||||
titles = list(titles)
|
||||
assert titles
|
||||
params = {
|
||||
"prop": "links",
|
||||
"pllimit": 500,
|
||||
"plnamespace": 0,
|
||||
"titles": "|".join(titles),
|
||||
}
|
||||
ret = api_get(params)["query"]
|
||||
return dict(
|
||||
(doc["title"], {l["title"] for l in doc["links"]})
|
||||
for doc in ret["pages"].values()
|
||||
if "links" in doc
|
||||
)
|
||||
|
||||
|
||||
def find_disambig(titles: list[str]) -> list[str]:
|
||||
"""Find disambiguation articles in the given list of titles."""
|
||||
titles = list(titles)
|
||||
assert titles
|
||||
pos = 0
|
||||
disambig: list[str] = []
|
||||
params = {
|
||||
"prop": "templates",
|
||||
"tllimit": 500,
|
||||
"tlnamespace": 10, # templates
|
||||
"continue": "",
|
||||
}
|
||||
while pos < len(titles):
|
||||
params["titles"] = "|".join(titles[pos : pos + 50])
|
||||
ret = api_get(params)
|
||||
disambig.extend(
|
||||
doc["title"] for doc in ret["query"]["pages"] if is_disambig(doc)
|
||||
)
|
||||
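# Template lists are paged too: follow tlcontinue up to ten times for this batch of titles.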
for i in range(10):
|
||||
if "continue" not in ret:
|
||||
break
|
||||
tlcontinue = ret["continue"]["tlcontinue"]
|
||||
params["titles"] = "|".join(titles[pos : pos + 50])
|
||||
params["tlcontinue"] = tlcontinue
|
||||
ret = api_get(params)
|
||||
disambig.extend(
|
||||
doc["title"] for doc in ret["query"]["pages"] if is_disambig(doc)
|
||||
)
|
||||
pos += 50
|
||||
|
||||
return disambig
|
||||
|
||||
|
||||
def wiki_redirects(q): # pages that link here
|
||||
params = {
|
||||
"list": "backlinks",
|
||||
"blfilterredir": "redirects",
|
||||
"bllimit": 500,
|
||||
"blnamespace": 0,
|
||||
"bltitle": q,
|
||||
}
|
||||
docs = api_get(params)["query"]["backlinks"]
|
||||
assert all("redirect" in doc for doc in docs)
|
||||
return (doc["title"] for doc in docs)
|
||||
|
||||
|
||||
def wiki_backlink(q: str) -> tuple[set[str], set[str]]:
|
||||
"""Get backlinks for article."""
|
||||
params = {
|
||||
"list": "backlinks",
|
||||
"bllimit": 500,
|
||||
"blnamespace": 0,
|
||||
"bltitle": q,
|
||||
"continue": "",
|
||||
}
|
||||
ret = api_get(params)
|
||||
docs = ret["query"]["backlinks"]
|
||||
while "continue" in ret:
|
||||
params["blcontinue"] = ret["continue"]["blcontinue"]
|
||||
ret = api_get(params)
|
||||
docs += ret["query"]["backlinks"]
|
||||
|
||||
articles = {doc["title"] for doc in docs if "redirect" not in doc}
|
||||
redirects = {doc["title"] for doc in docs if "redirect" in doc}
|
||||
return (articles, redirects)
|
||||
|
||||
|
||||
def call_get_diff(title, section_num, section_text):
|
||||
data = {
|
||||
"prop": "revisions",
|
||||
"rvprop": "timestamp",
|
||||
"titles": title,
|
||||
"rvsection": section_num,
|
||||
"rvdifftotext": section_text.strip(),
|
||||
}
|
||||
|
||||
s = get_session()
|
||||
ret = s.post(get_query_url(), data=data).json()
|
||||
check_for_error(ret)
|
||||
return ret["query"]["pages"][0]["revisions"][0]["diff"]["body"]
|
add_links/core.py (Normal file, 198 lines)
@@ -0,0 +1,198 @@
|||
"""Core functions."""
|
||||
|
||||
import html
|
||||
import re
|
||||
import typing
|
||||
from pprint import pprint
|
||||
|
||||
from .api import (
|
||||
MediawikiError,
|
||||
all_pages,
|
||||
cat_start,
|
||||
categorymembers,
|
||||
find_disambig,
|
||||
get_first_page,
|
||||
wiki_backlink,
|
||||
wiki_search,
|
||||
)
|
||||
from .util import case_flip_first, norm
|
||||
|
||||
re_redirect = re.compile(r"#REDIRECT \[\[(.)([^#]*?)(#.*)?\]\]")
|
||||
|
||||
|
||||
def get_content_and_timestamp(title: str) -> tuple[str, str]:
|
||||
"""Get article content and timestamp of last update."""
|
||||
params = {
|
||||
"prop": "revisions|info",
|
||||
"rvprop": "content|timestamp",
|
||||
"titles": title,
|
||||
}
|
||||
json_data: dict[str, typing.Any] = get_first_page(params)
|
||||
if json_data.get("invalid"):
|
||||
raise MediawikiError(json_data["invalidreason"])
|
||||
rev = json_data["revisions"][0]
|
||||
return (rev["content"], rev["timestamp"])
|
||||
|
||||
|
||||
def get_revision_info(title: str) -> dict[str, typing.Any]:
|
||||
"""Get info about latest revision of article."""
|
||||
params = {
|
||||
"prop": "revisions|info",
|
||||
"rvprop": "content|timestamp|ids",
|
||||
"titles": title,
|
||||
}
|
||||
json_data: dict[str, typing.Any] = get_first_page(params)
|
||||
if json_data.get("invalid"):
|
||||
raise MediawikiError(json_data["invalidreason"])
|
||||
revs = json_data.pop("revisions")
|
||||
ret = revs[0]
|
||||
ret["pageid"] = json_data["pageid"]
|
||||
pprint(json_data)
|
||||
return typing.cast(dict[str, typing.Any], ret)
|
||||
|
||||
|
||||
def is_redirect_to(title_from: str, title_to: str) -> bool:
|
||||
title_from = title_from.replace("_", " ")
|
||||
params = {"prop": "info", "titles": title_from}
|
||||
if "redirect" not in get_first_page(params):
|
||||
return False
|
||||
|
||||
params = {"prop": "revisions", "rvprop": "content", "titles": title_from}
|
||||
page_text = get_first_page(params)["revisions"][0]["content"]
|
||||
m = re_redirect.match(page_text)
|
||||
assert m
|
||||
title_to = title_to[0].upper() + title_to[1:]
|
||||
return m.group(1).upper() + m.group(2) == title_to
|
||||
|
||||
|
||||
def find_longer(
|
||||
q: str, search: list[dict[str, typing.Any]], articles: set[str]
|
||||
) -> list[str]:
|
||||
"""Find other articles with titles that are longer."""
|
||||
this_title = q[0].upper() + q[1:]
|
||||
longer: list[str] = all_pages(this_title)
|
||||
lq = q.lower()
|
||||
for doc in search:
|
||||
lt = doc["title"].lower()
|
||||
if lq == lt or lq not in lt:
|
||||
continue
|
||||
articles.add(doc["title"])
|
||||
more_articles, more_redirects = wiki_backlink(doc["title"])
|
||||
articles.update(more_articles)
|
||||
if doc["title"] not in longer:
|
||||
longer.append(doc["title"])
|
||||
|
||||
return longer
|
||||
|
||||
|
||||
def tidy_snippet(snippet: str) -> str:
|
||||
"""Remove HTML from snippet."""
|
||||
snippet = snippet.replace("\u2013", "-")
|
||||
snippet = snippet.replace("</span>", "")
|
||||
snippet = snippet.replace('<span class="searchmatch">', "")
|
||||
return html.unescape(snippet)
|
||||
|
||||
|
||||
def match_type(q: str, snippet: str) -> str | None:
|
||||
"""Discover match type, ''exact', 'case_mismatch' or None.
|
||||
|
||||
>>> match_type('foo', 'foo')
|
||||
'exact'
|
||||
>>> match_type('foo', 'bar') is None
|
||||
True
|
||||
>>> match_type('bar', 'foo bar baz')
|
||||
'exact'
|
||||
>>> match_type('clean coal technology', 'foo clean coal technologies baz')
|
||||
'exact'
|
||||
>>> match_type('bar', 'foo Bar baz')
|
||||
'exact'
|
||||
>>> match_type('bar', 'foo BAR baz')
|
||||
'case_mismatch'
|
||||
>>> match_type('foo-bar', 'aa foo-bar cc')
|
||||
'exact'
|
||||
>>> match_type(u'foo\u2013bar', 'aa foo-bar cc')
|
||||
'exact'
|
||||
"""
|
||||
q = q.replace("\u2013", "-")
|
||||
snippet = tidy_snippet(snippet)
|
||||
|
||||
if q in snippet or case_flip_first(q) in snippet:
|
||||
return "exact"
|
||||
match = None
|
||||
if q.lower() in snippet.lower():
|
||||
match = "case_mismatch"
|
||||
if match != "exact" and q.endswith("y"):
|
||||
if q[:-1] in snippet or case_flip_first(q[:-1]) in snippet:
|
||||
return "exact"
|
||||
elif match is None:
|
||||
if q[:-1].lower() in snippet.lower():
|
||||
match = "case_mismatch"
|
||||
return match
|
||||
|
||||
|
||||
def do_search(
|
||||
q: str, redirect_to: str | None = None
|
||||
) -> dict[str, int | list[dict[str, typing.Any]] | list[str] | None]:
|
||||
this_title = q[0].upper() + q[1:]
|
||||
|
||||
totalhits, search_hits = wiki_search(q)
|
||||
articles, redirects = wiki_backlink(redirect_to or q)
|
||||
cm = set()
|
||||
start = cat_start(q)
|
||||
if len(start) > 5:
|
||||
start = [] # big categories take too long
|
||||
for cat in set(["Category:" + this_title] + start):
|
||||
cm.update(categorymembers(cat))
|
||||
|
||||
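# Redirects that normalise to the query, or that contain it, count as already linked; pull in their backlinks too.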
norm_q = norm(q)
|
||||
norm_match_redirect = {r for r in redirects if norm(r) == norm_q}
|
||||
longer_redirect = {r for r in redirects if q.lower() in r.lower()}
|
||||
|
||||
articles.add(this_title)
|
||||
if redirect_to:
|
||||
articles.add(redirect_to[0].upper() + redirect_to[1:])
|
||||
|
||||
longer_redirect = {r for r in redirects if q.lower() in r.lower()}
|
||||
for r in norm_match_redirect | longer_redirect:
|
||||
articles.add(r)
|
||||
a2, r2 = wiki_backlink(r)
|
||||
articles.update(a2)
|
||||
redirects.update(r2)
|
||||
|
||||
longer = find_longer(q, search_hits, articles) if len(q) > 6 else None
|
||||
|
||||
search: list[dict[str, typing.Any]] = [
|
||||
doc
|
||||
for doc in search_hits
|
||||
if doc["title"] not in articles and doc["title"] not in cm
|
||||
]
|
||||
if search:
|
||||
disambig = set(find_disambig([doc["title"] for doc in search]))
|
||||
search = [doc for doc in search if doc["title"] not in disambig]
|
||||
# and (doc['title'] not in links or this_title not in links[doc['title']])]
|
||||
for doc in search:
|
||||
without_markup = (
|
||||
doc["snippet"]
|
||||
.replace("<span class='searchmatch'>", "")
|
||||
.replace("</span>", "")
|
||||
.replace(" ", " ")
|
||||
)
|
||||
doc["match"] = match_type(q, without_markup)
|
||||
doc["snippet_without_markup"] = without_markup
|
||||
return {
|
||||
"totalhits": totalhits,
|
||||
"results": search,
|
||||
"longer": longer,
|
||||
}
|
||||
|
||||
|
||||
def get_case_from_content(title: str) -> str | None:
|
||||
"""Check article content to find the case of the article title."""
|
||||
content, timestamp = get_content_and_timestamp(title)
|
||||
if title == title.lower() and title in content:
|
||||
return title
|
||||
start = content.lower().find("'''" + title.replace("_", " ").lower() + "'''")
|
||||
if start != -1:
|
||||
return content[start + 3 : start + 3 + len(title)]
|
||||
|
||||
return None # article doesn't contain the title
|
add_links/language.py (Normal file, 146 lines)
@@ -0,0 +1,146 @@
|||
from flask import session, has_request_context
|
||||
|
||||
langs = [
|
||||
('af', 'Afrikaans', 'Afrikaans'),
|
||||
('als', 'Alemannisch', 'Alemannic'),
|
||||
('am', 'አማርኛ', 'Amharic'),
|
||||
('an', 'aragonés', 'Aragonese'),
|
||||
('ar', 'العربية', 'Arabic'),
|
||||
('arz', 'مصرى', 'Egyptian Arabic'),
|
||||
('ast', 'asturianu', 'Asturian'),
|
||||
('az', 'azərbaycanca', 'Azerbaijani'),
|
||||
('azb', 'تۆرکجه', 'Southern Azerbaijani'),
|
||||
('ba', 'башҡортса', 'Bashkir'),
|
||||
('bar', 'Boarisch', 'Bavarian'),
|
||||
('bat-smg', 'žemaitėška', 'Samogitian'),
|
||||
('be', 'беларуская', 'Belarusian'),
|
||||
('be-tarask', 'беларуская (тарашкевіца)', 'Belarusian (Taraškievica)'),
|
||||
('bg', 'български', 'Bulgarian'),
|
||||
('bn', 'বাংলা', 'Bengali'),
|
||||
('bpy', 'বিষ্ণুপ্রিয়া মণিপুরী', 'Bishnupriya Manipuri'),
|
||||
('br', 'brezhoneg', 'Breton'),
|
||||
('bs', 'bosanski', 'Bosnian'),
|
||||
('bug', 'ᨅᨔ ᨕᨘᨁᨗ', 'Buginese'),
|
||||
('ca', 'català', 'Catalan'),
|
||||
('ce', 'нохчийн', 'Chechen'),
|
||||
('ceb', 'Cebuano', 'Cebuano'),
|
||||
('ckb', 'کوردیی ناوەندی', 'Kurdish (Sorani)'),
|
||||
('cs', 'čeština', 'Czech'),
|
||||
('cv', 'Чӑвашла', 'Chuvash'),
|
||||
('cy', 'Cymraeg', 'Welsh'),
|
||||
('da', 'dansk', 'Danish'),
|
||||
('de', 'Deutsch', 'German'),
|
||||
('el', 'Ελληνικά', 'Greek'),
|
||||
('en', 'English', 'English'),
|
||||
('eo', 'Esperanto', 'Esperanto'),
|
||||
('es', 'español', 'Spanish'),
|
||||
('et', 'eesti', 'Estonian'),
|
||||
('eu', 'euskara', 'Basque'),
|
||||
('fa', 'فارسی', 'Persian'),
|
||||
('fi', 'suomi', 'Finnish'),
|
||||
('fo', 'føroyskt', 'Faroese'),
|
||||
('fr', 'français', 'French'),
|
||||
('fy', 'Frysk', 'West Frisian'),
|
||||
('ga', 'Gaeilge', 'Irish'),
|
||||
('gd', 'Gàidhlig', 'Scottish Gaelic'),
|
||||
('gl', 'galego', 'Galician'),
|
||||
('gu', 'ગુજરાતી', 'Gujarati'),
|
||||
('he', 'עברית', 'Hebrew'),
|
||||
('hi', 'हिन्दी', 'Hindi'),
|
||||
('hr', 'hrvatski', 'Croatian'),
|
||||
('hsb', 'hornjoserbsce', 'Upper Sorbian'),
|
||||
('ht', 'Kreyòl ayisyen', 'Haitian'),
|
||||
('hu', 'magyar', 'Hungarian'),
|
||||
('hy', 'Հայերեն', 'Armenian'),
|
||||
('ia', 'interlingua', 'Interlingua'),
|
||||
('id', 'Bahasa Indonesia', 'Indonesian'),
|
||||
('io', 'Ido', 'Ido'),
|
||||
('is', 'íslenska', 'Icelandic'),
|
||||
('it', 'italiano', 'Italian'),
|
||||
('ja', '日本語', 'Japanese'),
|
||||
('jv', 'Basa Jawa', 'Javanese'),
|
||||
('ka', 'ქართული', 'Georgian'),
|
||||
('kk', 'қазақша', 'Kazakh'),
|
||||
('kn', 'ಕನ್ನಡ', 'Kannada'),
|
||||
('ko', '한국어', 'Korean'),
|
||||
('ku', 'Kurdî', 'Kurdish (Kurmanji)'),
|
||||
('ky', 'Кыргызча', 'Kirghiz'),
|
||||
('la', 'Latina', 'Latin'),
|
||||
('lb', 'Lëtzebuergesch', 'Luxembourgish'),
|
||||
('li', 'Limburgs', 'Limburgish'),
|
||||
('lmo', 'lumbaart', 'Lombard'),
|
||||
('lt', 'lietuvių', 'Lithuanian'),
|
||||
('lv', 'latviešu', 'Latvian'),
|
||||
('map-bms', 'Basa Banyumasan', 'Banyumasan'),
|
||||
('mg', 'Malagasy', 'Malagasy'),
|
||||
('min', 'Baso Minangkabau', 'Minangkabau'),
|
||||
('mk', 'македонски', 'Macedonian'),
|
||||
('ml', 'മലയാളം', 'Malayalam'),
|
||||
('mn', 'монгол', 'Mongolian'),
|
||||
('mr', 'मराठी', 'Marathi'),
|
||||
('mrj', 'кырык мары', 'Hill Mari'),
|
||||
('ms', 'Bahasa Melayu', 'Malay'),
|
||||
('my', 'မြန်မာဘာသာ', 'Burmese'),
|
||||
('mzn', 'مازِرونی', 'Mazandarani'),
|
||||
('nah', 'Nāhuatl', 'Nahuatl'),
|
||||
('nap', 'Napulitano', 'Neapolitan'),
|
||||
('nds', 'Plattdüütsch', 'Low Saxon'),
|
||||
('ne', 'नेपाली', 'Nepali'),
|
||||
('new', 'नेपाल भाषा', 'Newar'),
|
||||
('nl', 'Nederlands', 'Dutch'),
|
||||
('nn', 'norsk nynorsk', 'Norwegian (Nynorsk)'),
|
||||
('no', 'norsk bokmål', 'Norwegian (Bokmål)'),
|
||||
('oc', 'occitan', 'Occitan'),
|
||||
('or', 'ଓଡ଼ିଆ', 'Oriya'),
|
||||
('os', 'Ирон', 'Ossetian'),
|
||||
('pa', 'ਪੰਜਾਬੀ', 'Eastern Punjabi'),
|
||||
('pl', 'polski', 'Polish'),
|
||||
('pms', 'Piemontèis', 'Piedmontese'),
|
||||
('pnb', 'پنجابی', 'Western Punjabi'),
|
||||
('pt', 'português', 'Portuguese'),
|
||||
('qu', 'Runa Simi', 'Quechua'),
|
||||
('ro', 'română', 'Romanian'),
|
||||
('ru', 'русский', 'Russian'),
|
||||
('sa', 'संस्कृतम्', 'Sanskrit'),
|
||||
('sah', 'саха тыла', 'Sakha'),
|
||||
('scn', 'sicilianu', 'Sicilian'),
|
||||
('sco', 'Scots', 'Scots'),
|
||||
('sh', 'srpskohrvatski / српскохрватски', 'Serbo-Croatian'),
|
||||
('si', 'සිංහල', 'Sinhalese'),
|
||||
('simple', 'Simple English', 'Simple English'),
|
||||
('sk', 'slovenčina', 'Slovak'),
|
||||
('sl', 'slovenščina', 'Slovenian'),
|
||||
('sq', 'shqip', 'Albanian'),
|
||||
('sr', 'српски / srpski', 'Serbian'),
|
||||
('su', 'Basa Sunda', 'Sundanese'),
|
||||
('sv', 'svenska', 'Swedish'),
|
||||
('sw', 'Kiswahili', 'Swahili'),
|
||||
('ta', 'தமிழ்', 'Tamil'),
|
||||
('te', 'తెలుగు', 'Telugu'),
|
||||
('tg', 'тоҷикӣ', 'Tajik'),
|
||||
('th', 'ไทย', 'Thai'),
|
||||
('tl', 'Tagalog', 'Tagalog'),
|
||||
('tr', 'Türkçe', 'Turkish'),
|
||||
('tt', 'татарча/tatarça', 'Tatar'),
|
||||
('uk', 'українська', 'Ukrainian'),
|
||||
('ur', 'اردو', 'Urdu'),
|
||||
('uz', 'oʻzbekcha/ўзбекча', 'Uzbek'),
|
||||
('vec', 'vèneto', 'Venetian'),
|
||||
('vi', 'Tiếng Việt', 'Vietnamese'),
|
||||
('vo', 'Volapük', 'Volapük'),
|
||||
('wa', 'walon', 'Walloon'),
|
||||
('war', 'Winaray', 'Waray'),
|
||||
('yi', 'ייִדיש', 'Yiddish'),
|
||||
('yo', 'Yorùbá', 'Yoruba'),
|
||||
('zh', '中文', 'Chinese'),
|
||||
('zh-min-nan', 'Bân-lâm-gú', 'Min Nan'),
|
||||
('zh-yue', '粵語', 'Cantonese'),
|
||||
]
|
||||
|
||||
def get_langs() -> list[dict[str, str]]:
|
||||
"""List of all known languages."""
|
||||
return [dict(zip(('code', 'local', 'english'), l)) for l in langs]
|
||||
|
||||
def get_current_language() -> str:
|
||||
"""Return ISO-3166 language code for the current language."""
|
||||
return session.get('current_lang', 'en') if has_request_context() else 'en'
|
add_links/match.py (Normal file, 381 lines)
@@ -0,0 +1,381 @@
|||
from __future__ import unicode_literals
|
||||
|
||||
import re
|
||||
import typing
|
||||
|
||||
from .api import MissingPage, call_get_diff, get_wiki_info
|
||||
from .core import get_case_from_content, get_content_and_timestamp, get_revision_info
|
||||
from .util import is_title_case, lc_alpha
|
||||
|
||||
re_link_in_text = re.compile(r"\[\[[^]]+?\]\]", re.I | re.S)
|
||||
|
||||
|
||||
class LinkReplace(Exception):
|
||||
pass
|
||||
|
||||
|
||||
en_dash = "\u2013"
|
||||
trans = {",": ",?", " ": " *[-\n]? *"}
|
||||
trans[en_dash] = trans[" "]
|
||||
|
||||
trans2 = {" ": r"('?s?\]\])?'?s? ?(\[\[(?:.+\|)?)?", "-": "[- ]"}
|
||||
trans2[en_dash] = trans2[" "]
|
||||
|
||||
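# Candidate regexes for the phrase, tried in order; they tolerate existing link markup, hyphens and line breaks to varying degrees.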
patterns = [
|
||||
lambda q: re.compile(
|
||||
r"(?<!-)(?:\[\[(?:[^]]+\|)?)?(%s)%s(?:\]\])?"
|
||||
% (
|
||||
re.escape(q[0]),
|
||||
"".join("-?" + (trans2[c] if c in trans2 else re.escape(c)) for c in q[1:]),
|
||||
),
|
||||
re.I,
|
||||
),
|
||||
lambda q: re.compile(
|
||||
r"(?<!-)\[\[[^|]+\|(%s)%s\]\]" % (re.escape(q[0]), re.escape(q[1:])), re.I
|
||||
),
|
||||
lambda q: re.compile(
|
||||
r"(?<!-)\[\[[^|]+\|(%s)%s(?:\]\])?"
|
||||
% (
|
||||
re.escape(q[0]),
|
||||
"".join("-?" + (trans2[c] if c in trans2 else re.escape(c)) for c in q[1:]),
|
||||
),
|
||||
re.I,
|
||||
),
|
||||
lambda q: re.compile(r"(?<!-)(%s)%s" % (re.escape(q[0]), re.escape(q[1:])), re.I),
|
||||
lambda q: re.compile(
|
||||
r"(?<!-)(%s)%s"
|
||||
% (
|
||||
re.escape(q[0]),
|
||||
"".join((trans[c] if c in trans else re.escape(c)) for c in q[1:]),
|
||||
),
|
||||
re.I,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
class NoMatch(Exception):
|
||||
pass
|
||||
|
||||
|
||||
re_cite = re.compile(
|
||||
r"<ref( [^>]*?)?>\s*({{cite.*?}}|\[https?://[^]]*?\])\s*</ref>", re.I | re.S
|
||||
)
|
||||
|
||||
|
||||
def parse_cite(text: str) -> typing.Iterator[tuple[str, str]]:
|
||||
"""Parse a citation template."""
|
||||
prev = 0
|
||||
for m in re_cite.finditer(text):
|
||||
yield ("text", text[prev : m.start()])
|
||||
yield ("cite", m.group(0))
|
||||
prev = m.end()
|
||||
yield ("text", text[prev:])
|
||||
|
||||
|
||||
re_heading = re.compile(r"^\s*(=+)\s*(.+)\s*\1(<!--.*-->|\s)*$")
|
||||
|
||||
|
||||
def section_iter(text: str) -> typing.Iterator[tuple[str | None, str]]:
|
||||
"""Iterate sections yielding tuples of heading and section text."""
|
||||
cur_section = ""
|
||||
heading = None
|
||||
in_comment = False
|
||||
for line in text.splitlines(True):
|
||||
if "<!--" in line:
|
||||
in_comment = True
|
||||
if "-->" in line:
|
||||
in_comment = False
|
||||
m = re_heading.match(line)
|
||||
if in_comment or not m:
|
||||
cur_section += line
|
||||
continue
|
||||
if cur_section or heading:
|
||||
yield (heading, cur_section)
|
||||
heading = m.group()
|
||||
cur_section = ""
|
||||
continue
|
||||
yield (heading, cur_section)
|
||||
|
||||
|
||||
def get_subsections(text: str, section_num: int) -> str:
|
||||
"retrieve the text of subsections for a given section number within an article"
|
||||
found = ""
|
||||
collection_level = None
|
||||
for num, (heading, body) in enumerate(section_iter(text)):
|
||||
if heading is None:
|
||||
level = 0
|
||||
else:
|
||||
m = re_heading.match(heading)
|
||||
assert m
|
||||
level = len(m.group(1))
|
||||
if num == section_num:
|
||||
collection_level = level
|
||||
continue
|
||||
if collection_level:
|
||||
if level > collection_level:
|
||||
assert heading
|
||||
found += heading + body
|
||||
else:
|
||||
break
|
||||
return found
|
||||
|
||||
|
||||
def match_found(m, q, linkto):
|
||||
if q[1:] == m.group(0)[1:]:
|
||||
replacement = m.group(1) + q[1:]
|
||||
elif any(c.isupper() for c in q[1:]) or m.group(0) == m.group(0).upper():
|
||||
replacement = q
|
||||
elif is_title_case(m.group(0)):
|
||||
replacement = None
|
||||
replacement = get_case_from_content(q)
|
||||
if replacement is None:
|
||||
replacement = q.lower()
|
||||
else:
|
||||
replacement = m.group(1) + q[1:]
|
||||
assert replacement
|
||||
if linkto:
|
||||
if linkto[0].isupper() and replacement[0] == linkto[0].lower():
|
||||
linkto = linkto[0].lower() + linkto[1:]
|
||||
elif replacement[0].isupper():
|
||||
linkto = linkto[0].upper() + linkto[1:]
|
||||
replacement = linkto + "|" + replacement
|
||||
return replacement
|
||||
|
||||
|
||||
def parse_links(text: str) -> typing.Iterator[tuple[str, str]]:
|
||||
prev = 0
|
||||
for m in re_link_in_text.finditer(text):
|
||||
if prev != m.start():
|
||||
yield ("text", text[prev : m.start()])
|
||||
if any(
|
||||
m.group().lower().startswith("[[" + prefix)
|
||||
for prefix in ("file:", "image:")
|
||||
):
|
||||
yield ("image", m.group(0))
|
||||
else:
|
||||
yield ("link", m.group(0))
|
||||
prev = m.end()
|
||||
if prev < len(text):
|
||||
yield ("text", text[prev:])
|
||||
|
||||
|
||||
def mk_link_matcher(q):
|
||||
re_links = [p(q) for p in patterns]
|
||||
|
||||
def search_for_link(text):
|
||||
for re_link in re_links:
|
||||
m = re_link.search(text)
|
||||
if m and m.group(0).count("[[") < 4:
|
||||
return m
|
||||
|
||||
return search_for_link
|
||||
|
||||
|
||||
def add_link(m, replacement, text):
|
||||
return m.re.sub(lambda m: "[[%s]]" % replacement, text, count=1)
|
||||
|
||||
|
||||
def find_link_in_chunk(q, content, linkto=None):
|
||||
search_for_link = mk_link_matcher(q)
|
||||
new_content = ""
|
||||
replacement = None
|
||||
|
||||
match_in_non_link = False
|
||||
bad_link_match = False
|
||||
found_text_to_link = None
|
||||
|
||||
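# Walk the wikitext token by token; a match in plain text takes priority over rewriting an existing link.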
for token_type, text in parse_links(content):
|
||||
if token_type == "text":
|
||||
if search_for_link(text):
|
||||
match_in_non_link = True
|
||||
elif token_type == "image":
|
||||
before, sep, link_text = text[:-2].rpartition("|")
|
||||
m = search_for_link(link_text)
|
||||
if m:
|
||||
found_text_to_link = m.group(0)
|
||||
replacement = match_found(m, q, linkto)
|
||||
text = before + sep + add_link(m, replacement, link_text) + "]]"
|
||||
elif token_type == "link" and not replacement and not match_in_non_link:
|
||||
link_text = text[2:-2]
|
||||
link_dest = None
|
||||
if "|" in link_text:
|
||||
link_dest, link_text = link_text.split("|", 1)
|
||||
m = search_for_link(link_text)
|
||||
if m and (not link_dest or not link_dest.startswith("#")):
|
||||
lc_alpha_q = lc_alpha(q)
|
||||
|
||||
bad_link_match = (
|
||||
link_dest
|
||||
and len(link_dest) > len(q)
|
||||
and (lc_alpha_q not in lc_alpha(link_dest))
|
||||
)
|
||||
if not link_dest:
|
||||
if q in link_text and len(link_text) > len(q):
|
||||
bad_link_match = True
|
||||
if bad_link_match and link_dest:
|
||||
try:
|
||||
link_dest_redirect = get_wiki_info(link_dest)
|
||||
except MissingPage:
|
||||
link_dest_redirect = None
|
||||
if (
|
||||
link_dest_redirect
|
||||
and lc_alpha(link_dest_redirect) == lc_alpha_q
|
||||
):
|
||||
bad_link_match = False
|
||||
if not bad_link_match:
|
||||
replacement = match_found(m, q, linkto)
|
||||
found_text_to_link = m.group(0)
|
||||
text = add_link(m, replacement, link_text)
|
||||
new_content += text
|
||||
if not replacement:
|
||||
if bad_link_match:
|
||||
raise LinkReplace
|
||||
m = search_for_link(content)
|
||||
if m:
|
||||
found_text_to_link = m.group(0)
|
||||
replacement = match_found(m, q, linkto)
|
||||
new_content = add_link(m, replacement, content)
|
||||
if linkto:
|
||||
m_end = m.end()
|
||||
re_extend = re.compile(m.re.pattern + r"\w*\b", re.I)
|
||||
m = re_extend.search(content)
|
||||
if m and m.end() > m_end:
|
||||
replacement += content[m_end : m.end()]
|
||||
new_content = add_link(m, replacement, content)
|
||||
return (new_content, replacement, found_text_to_link)
|
||||
|
||||
|
||||
def find_link_in_text(q, content):
|
||||
(new_content, replacement, _) = find_link_in_chunk(q, content)
|
||||
if replacement:
|
||||
return (new_content, replacement)
|
||||
raise NoMatch
|
||||
|
||||
|
||||
def find_link_in_content(q, content, linkto=None):
|
||||
if linkto:
|
||||
try:
|
||||
return find_link_in_content(linkto, content)
|
||||
except NoMatch:
|
||||
pass
|
||||
replacement = None
|
||||
new_content = ""
|
||||
link_replace = False
|
||||
for header, section_text in section_iter(content):
|
||||
if header:
|
||||
new_content += header
|
||||
for token_type, text in parse_cite(section_text):
|
||||
if token_type == "text" and not replacement:
|
||||
try:
|
||||
(new_text, replacement, replaced_text) = find_link_in_chunk(
|
||||
q, text, linkto=linkto
|
||||
)
|
||||
except LinkReplace:
|
||||
link_replace = True
|
||||
if replacement:
|
||||
text = new_text
|
||||
new_content += text
|
||||
if replacement:
|
||||
return (new_content, replacement, replaced_text)
|
||||
raise LinkReplace if link_replace else NoMatch
|
||||
|
||||
|
||||
def find_link_and_section(q, content, linkto=None):
|
||||
if linkto:
|
||||
try:
|
||||
return find_link_and_section(linkto, content)
|
||||
except NoMatch:
|
||||
pass
|
||||
sections = list(section_iter(content))
|
||||
replacement = None
|
||||
|
||||
search_for_link = mk_link_matcher(q)
|
||||
|
||||
found: dict[str, str | int] = {}
|
||||
|
||||
for section_num, (header, section_text) in enumerate(sections):
|
||||
new_content = ""
|
||||
if header:
|
||||
new_content += header
|
||||
for token_type, text in parse_cite(section_text):
|
||||
if token_type == "text" and not replacement:
|
||||
new_text = ""
|
||||
for token_type2, text2 in parse_links(text):
|
||||
if token_type2 == "link" and not replacement:
|
||||
link_text = text2[2:-2]
|
||||
if "|" in link_text:
|
||||
link_dest, link_text = link_text.split("|", 1)
|
||||
else:
|
||||
link_dest = None
|
||||
m = search_for_link(link_text)
|
||||
if m:
|
||||
if link_dest:
|
||||
found["link_dest"] = link_dest
|
||||
found["link_text"] = link_text
|
||||
replacement = match_found(m, q, None)
|
||||
text2 = add_link(m, replacement, link_text)
|
||||
new_text += text2
|
||||
if replacement:
|
||||
text = new_text
|
||||
else:
|
||||
m = search_for_link(text)
|
||||
if m:
|
||||
replacement = match_found(m, q, linkto)
|
||||
text = add_link(m, replacement, text)
|
||||
new_content += text
|
||||
if replacement:
|
||||
found.update(
|
||||
{
|
||||
"section_num": section_num,
|
||||
"section_text": new_content,
|
||||
"old_text": (header or "") + section_text,
|
||||
"replacement": replacement,
|
||||
}
|
||||
)
|
||||
return found
|
||||
raise NoMatch
|
||||
|
||||
|
||||
def find_refs(text: str) -> list[str]:
|
||||
"""Find <ref> in wikitext."""
|
||||
|
||||
refs = re.findall("<ref(?:[^>]*)>(.+?)</ref>", text)
|
||||
print(refs)
|
||||
return refs
|
||||
|
||||
|
||||
def new_link_is_in_ref(replacement: str, text: str) -> bool:
|
||||
"""Is the new link in a <ref>."""
|
||||
link = f"[[{replacement}]]"
|
||||
return any(link in ref for ref in find_refs(text))
|
||||
|
||||
|
||||
def get_match(q: str, title: str, linkto: str | None) -> dict[str, typing.Any]:
|
||||
"""Get match."""
|
||||
rev = get_revision_info(title)
|
||||
|
||||
found: dict[str, typing.Any] = find_link_and_section(q, rev["content"], linkto)
|
||||
|
||||
assert not new_link_is_in_ref(found["replacement"], found["section_text"])
|
||||
|
||||
found["revid"] = rev["revid"]
|
||||
found["pageid"] = rev["pageid"]
|
||||
found["section_text"] += get_subsections(rev["content"], found["section_num"])
|
||||
|
||||
return found
|
||||
|
||||
|
||||
def get_diff(q: str, title: str, linkto: str | None) -> dict[str, typing.Any]:
|
||||
"""Get diff."""
|
||||
content, timestamp = get_content_and_timestamp(title)
|
||||
found: dict[str, typing.Any] = find_link_and_section(q, content, linkto)
|
||||
|
||||
if new_link_is_in_ref(found["replacement"], found["section_text"]):
|
||||
raise NoMatch
|
||||
|
||||
section_text = found["section_text"] + get_subsections(
|
||||
content, found["section_num"]
|
||||
)
|
||||
|
||||
found["diff"] = call_get_diff(title, found["section_num"], section_text)
|
||||
return found
|
add_links/mediawiki_api.py (Normal file, 101 lines)
@@ -0,0 +1,101 @@
|||
"""Interface with the mediawiki API."""
|
||||
|
||||
import typing
|
||||
from pprint import pprint
|
||||
from typing import Any, cast
|
||||
|
||||
from . import wikidata_oauth
|
||||
|
||||
wiki_hostname = "en.wikipedia.org"
|
||||
wiki_api_php = f"https://{wiki_hostname}/w/api.php"
|
||||
user_agent = "add-links/0.1"
|
||||
|
||||
|
||||
def parse_page(enwiki: str) -> dict[str, Any]:
|
||||
"""Call mediawiki parse API for given article."""
|
||||
params: dict[str, str | int] = {
|
||||
"action": "parse",
|
||||
"format": "json",
|
||||
"formatversion": 2,
|
||||
"disableeditsection": 1,
|
||||
"page": enwiki,
|
||||
"prop": "text|links|headhtml",
|
||||
"disabletoc": 1,
|
||||
}
|
||||
|
||||
parse: dict[str, Any] = call(params)["parse"]
|
||||
return parse
|
||||
|
||||
|
||||
def call(params: dict[str, str | int]) -> dict[str, typing.Any]:
|
||||
"""Make GET request to mediawiki API."""
|
||||
data = wikidata_oauth.api_post_request(params)
|
||||
return cast(dict[str, Any], data.json())
|
||||
|
||||
|
||||
def article_exists(title: str) -> bool:
|
||||
"""Get article text."""
|
||||
params: dict[str, str | int] = {
|
||||
"action": "query",
|
||||
"format": "json",
|
||||
"formatversion": 2,
|
||||
"titles": title,
|
||||
}
|
||||
return not call(params)["query"]["pages"][0].get("missing")
|
||||
|
||||
|
||||
def get_content(title: str) -> tuple[str, int]:
|
||||
"""Get article text."""
|
||||
params: dict[str, str | int] = {
|
||||
"action": "query",
|
||||
"format": "json",
|
||||
"formatversion": 2,
|
||||
"prop": "revisions|info",
|
||||
"rvprop": "content|timestamp|ids",
|
||||
"titles": title,
|
||||
}
|
||||
data = call(params)
|
||||
rev = data["query"]["pages"][0]["revisions"][0]
|
||||
content: str = rev["content"]
|
||||
revid: int = int(rev["revid"])
|
||||
return content, revid
|
||||
|
||||
|
||||
def compare(title: str, new_text: str) -> str:
|
||||
"""Generate a diff for the new article text."""
|
||||
params: dict[str, str | int] = {
|
||||
"format": "json",
|
||||
"formatversion": 2,
|
||||
"action": "compare",
|
||||
"fromtitle": title,
|
||||
"toslots": "main",
|
||||
"totext-main": new_text,
|
||||
"prop": "diff",
|
||||
}
|
||||
diff: str = call(params)["compare"]["body"]
|
||||
return diff
|
||||
|
||||
|
||||
def edit_page(
|
||||
pageid: int, section: str | int, text: str, summary: str, baserevid: str, token: str
|
||||
) -> str:
|
||||
"""Edit a page on Wikipedia."""
|
||||
params: dict[str, str | int] = {
|
||||
"format": "json",
|
||||
"formatversion": 2,
|
||||
"action": "edit",
|
||||
"pageid": pageid,
|
||||
"text": text,
|
||||
"baserevid": baserevid,
|
||||
"token": token,
|
||||
"nocreate": 1,
|
||||
"summary": summary,
|
||||
"section": section,
|
||||
}
|
||||
ret = call(params)
|
||||
if "edit" not in ret:
|
||||
print("params")
|
||||
pprint(params)
|
||||
print()
|
||||
pprint(ret)
|
||||
return typing.cast(str, ret["edit"])
|
add_links/mediawiki_api_old.py (Normal file, 48 lines)
@@ -0,0 +1,48 @@
|||
"""Interface with the mediawiki API."""
|
||||
|
||||
from typing import Any
|
||||
|
||||
import requests
|
||||
|
||||
wiki_hostname = "en.wikipedia.org"
|
||||
wiki_api_php = f"https://{wiki_hostname}/w/api.php"
|
||||
user_agent = "dab-mechanic/0.1"
|
||||
|
||||
|
||||
def parse_page(enwiki: str) -> dict[str, Any]:
|
||||
"""Call mediawiki parse API for given article."""
|
||||
params: dict[str, str | int] = {
|
||||
"action": "parse",
|
||||
"format": "json",
|
||||
"formatversion": 2,
|
||||
"disableeditsection": 1,
|
||||
"page": enwiki,
|
||||
"prop": "text|links|headhtml",
|
||||
"disabletoc": 1,
|
||||
}
|
||||
|
||||
parse: dict[str, Any] = get(params)["parse"]
|
||||
return parse
|
||||
|
||||
|
||||
def get(params: dict[str, str | int]) -> dict[str, Any]:
|
||||
"""Make GET request to mediawiki API."""
|
||||
data: dict[str, Any] = requests.get(
|
||||
wiki_api_php, headers={"User-Agent": user_agent}, params=params
|
||||
).json()
|
||||
return data
|
||||
|
||||
|
||||
def get_content(title: str) -> str:
|
||||
"""Get article text."""
|
||||
params: dict[str, str | int] = {
|
||||
"action": "query",
|
||||
"format": "json",
|
||||
"formatversion": 2,
|
||||
"prop": "revisions|info",
|
||||
"rvprop": "content|timestamp",
|
||||
"titles": title,
|
||||
}
|
||||
data = get(params)
|
||||
rev: str = data["query"]["pages"][0]["revisions"][0]["content"]
|
||||
return rev
|
add_links/util.py (Normal file, 115 lines)
@@ -0,0 +1,115 @@
|||
"""Util functions."""
|
||||
|
||||
import re
|
||||
import urllib
|
||||
from typing import Any
|
||||
|
||||
# util functions that don't access the network
|
||||
|
||||
namespaces = {
|
||||
ns.casefold()
|
||||
for ns in (
|
||||
"Special",
|
||||
"Media",
|
||||
"Talk",
|
||||
"Template",
|
||||
"Portal",
|
||||
"Portal talk",
|
||||
"Book",
|
||||
"Book talk",
|
||||
"Template talk",
|
||||
"Draft",
|
||||
"Draft talk",
|
||||
"Help",
|
||||
"Help talk",
|
||||
"Category",
|
||||
"Category talk",
|
||||
"User",
|
||||
"Gadget",
|
||||
"Gadget talk",
|
||||
"Gadget definition",
|
||||
"Gadget definition talk",
|
||||
"Topic",
|
||||
"User talk",
|
||||
"Wikipedia",
|
||||
"Education Program",
|
||||
"Education Program talk",
|
||||
"Wikipedia talk",
|
||||
"File",
|
||||
"File talk",
|
||||
"TimedText",
|
||||
"TimedText talk",
|
||||
"MediaWiki",
|
||||
"Module",
|
||||
"Module talk",
|
||||
"MediaWiki talk",
|
||||
)
|
||||
}
|
||||
|
||||
re_space_or_dash = re.compile("[ -]")
|
||||
|
||||
|
||||
def is_title_case(phrase: str) -> bool:
|
||||
"""Is a given phrase is in Title Case."""
|
||||
return all(
|
||||
term[0].isupper() and term[1:].islower()
|
||||
for term in re_space_or_dash.split(phrase)
|
||||
if term and term[0].isalpha()
|
||||
)
|
||||
|
||||
|
||||
def urlquote(value: str) -> str:
|
||||
"""Prepare string for use in URL param."""
|
||||
return urllib.parse.quote_plus(value.encode("utf-8"))
|
||||
|
||||
|
||||
def strip_parens(q: str) -> str:
|
||||
"""Remove a word in parenthesis from the end of a string."""
|
||||
m = re.search(r" \(.*?\)$", q)
|
||||
return q[: m.start()] if m else q
|
||||
|
||||
|
||||
def starts_with_namespace(title: str) -> bool:
|
||||
"""Check if a title starts with a namespace."""
|
||||
return ":" in title and title.split(":", 1)[0].casefold() in namespaces
|
||||
|
||||
|
||||
def is_disambig(doc: dict[str, Any]) -> bool:
|
||||
"""Is a this a disambiguation page."""
|
||||
return any(
|
||||
"disambig" in t
|
||||
or t.endswith("dis")
|
||||
or "given name" in t
|
||||
or t == "template:surname"
|
||||
for t in (t["title"].lower() for t in doc.get("templates", []))
|
||||
)
|
||||
|
||||
|
||||
def norm(s: str) -> str:
|
||||
"""Normalise string."""
|
||||
s = re.sub(r"\W", "", s).lower()
|
||||
return s[:-1] if s and s[-1] == "s" else s
|
||||
|
||||
|
||||
def case_flip(s: str) -> str:
|
||||
"""Switch case of character."""
|
||||
if s.islower():
|
||||
return s.upper()
|
||||
if s.isupper():
|
||||
return s.lower()
|
||||
return s
|
||||
|
||||
|
||||
def case_flip_first(s: str) -> str:
|
||||
"""Switch case of first character in string."""
|
||||
return case_flip(s[0]) + s[1:]
|
||||
|
||||
|
||||
def lc_alpha(s: str) -> str:
|
||||
"""Lower case alphabetic characters in string."""
|
||||
return "".join(c.lower() for c in s if c.isalpha())
|
||||
|
||||
|
||||
def wiki_space_norm(s: str) -> str:
|
||||
"""Normalise article title."""
|
||||
return s.replace("_", " ").strip()
|
add_links/wikidata_oauth.py (Normal file, 98 lines)
@@ -0,0 +1,98 @@
|||
import typing
|
||||
import urllib
|
||||
from typing import cast
|
||||
|
||||
from flask import current_app, session
|
||||
from requests_oauthlib import OAuth1Session
|
||||
|
||||
wiki_hostname = "en.wikipedia.org"
|
||||
api_url = f"https://{wiki_hostname}/w/api.php"
|
||||
|
||||
|
||||
def get_edit_proxy() -> dict[str, str]:
|
||||
"""Retrieve proxy information from config."""
|
||||
edit_proxy = current_app.config.get("EDIT_PROXY")
|
||||
if edit_proxy:
|
||||
return {"http": edit_proxy, "https": edit_proxy}
|
||||
else:
|
||||
return {}
|
||||
|
||||
|
||||
def api_post_request(params: dict[str, str | int]):
|
||||
"""HTTP Post using Oauth."""
|
||||
app = current_app
|
||||
# url = "https://www.wikidata.org/w/api.php"
|
||||
client_key = app.config["CLIENT_KEY"]
|
||||
client_secret = app.config["CLIENT_SECRET"]
|
||||
oauth = OAuth1Session(
|
||||
client_key,
|
||||
client_secret=client_secret,
|
||||
resource_owner_key=session["owner_key"],
|
||||
resource_owner_secret=session["owner_secret"],
|
||||
)
|
||||
proxies = get_edit_proxy()
|
||||
return oauth.post(api_url, data=params, timeout=4, proxies=proxies)
|
||||
|
||||
|
||||
def raw_request(params: typing.Mapping[str, str | int]):
|
||||
"""Low-level API request."""
|
||||
app = current_app
|
||||
# url = "https://www.wikidata.org/w/api.php?" + urlencode(params)
|
||||
client_key = app.config["CLIENT_KEY"]
|
||||
client_secret = app.config["CLIENT_SECRET"]
|
||||
oauth = OAuth1Session(
|
||||
client_key,
|
||||
client_secret=client_secret,
|
||||
resource_owner_key=session["owner_key"],
|
||||
resource_owner_secret=session["owner_secret"],
|
||||
)
|
||||
proxies = get_edit_proxy()
|
||||
return oauth.get(
|
||||
api_url + "?" + urllib.parse.urlencode(params), timeout=4, proxies=proxies
|
||||
)
|
||||
|
||||
|
||||
def api_request(params: typing.Mapping[str, str | int]) -> dict[str, typing.Any]:
|
||||
"""Make an API request with OAuth."""
|
||||
r = raw_request(params)
|
||||
try:
|
||||
return cast(dict[str, typing.Any], r.json())
|
||||
except Exception:
|
||||
print("text")
|
||||
print(r.text)
|
||||
print("---")
|
||||
raise
|
||||
|
||||
|
||||
def get_token() -> str:
|
||||
"""Get CSRF tokebn from MediaWiki API."""
|
||||
params: dict[str, str | int] = {
|
||||
"action": "query",
|
||||
"meta": "tokens",
|
||||
"format": "json",
|
||||
"formatversion": 2,
|
||||
}
|
||||
reply = api_request(params)
|
||||
token: str = reply["query"]["tokens"]["csrftoken"]
|
||||
|
||||
return token
|
||||
|
||||
|
||||
def userinfo_call() -> typing.Mapping[str, typing.Any]:
|
||||
"""Request user information via OAuth."""
|
||||
params = {"action": "query", "meta": "userinfo", "format": "json"}
|
||||
return api_request(params)
|
||||
|
||||
|
||||
def get_username() -> None | str:
|
||||
"""Get the username or None if not logged in."""
|
||||
if "owner_key" not in session:
|
||||
return None # not authorized
|
||||
|
||||
if "username" not in session:
|
||||
reply = userinfo_call()
|
||||
if "query" not in reply:
|
||||
return None
|
||||
session["username"] = reply["query"]["userinfo"]["name"]
|
||||
|
||||
return cast(str, session["username"])
|
add_links/wikipedia.py (Normal file, 201 lines)
@@ -0,0 +1,201 @@
|||
from collections import defaultdict
|
||||
from typing import Any, Iterator, Optional, TypedDict
|
||||
|
||||
import flask
|
||||
import lxml.html
|
||||
|
||||
from . import mediawiki_api
|
||||
|
||||
disambig_templates = [
|
||||
"Template:Disambiguation",
|
||||
"Template:Airport disambiguation",
|
||||
"Template:Biology disambiguation",
|
||||
"Template:Call sign disambiguation",
|
||||
"Template:Caselaw disambiguation",
|
||||
"Template:Chinese title disambiguation",
|
||||
"Template:Disambiguation cleanup",
|
||||
"Template:Genus disambiguation",
|
||||
"Template:Hospital disambiguation",
|
||||
"Template:Human name disambiguation",
|
||||
"Template:Human name disambiguation cleanup",
|
||||
"Template:Letter-number combination disambiguation",
|
||||
"Template:Mathematical disambiguation",
|
||||
"Template:Military unit disambiguation",
|
||||
"Template:Music disambiguation",
|
||||
"Template:Number disambiguation",
|
||||
"Template:Opus number disambiguation",
|
||||
"Template:Phonetics disambiguation",
|
||||
"Template:Place name disambiguation",
|
||||
"Template:Portal disambiguation",
|
||||
"Template:Road disambiguation",
|
||||
"Template:School disambiguation",
|
||||
"Template:Species Latin name abbreviation disambiguation",
|
||||
"Template:Species Latin name disambiguation",
|
||||
"Template:Station disambiguation",
|
||||
"Template:Synagogue disambiguation",
|
||||
"Template:Taxonomic authority disambiguation",
|
||||
"Template:Taxonomy disambiguation",
|
||||
"Template:Template disambiguation",
|
||||
"Template:WoO number disambiguation",
|
||||
]
|
||||
|
||||
|
||||
def link_params(enwiki: str) -> dict[str, str | int]:
|
||||
"""Parameters for finding article links from the API."""
|
||||
params: dict[str, str | int] = {
|
||||
"action": "query",
|
||||
"format": "json",
|
||||
"formatversion": 2,
|
||||
"titles": enwiki,
|
||||
"generator": "links",
|
||||
"gpllimit": "max",
|
||||
"gplnamespace": 0,
|
||||
"tllimit": "max",
|
||||
"redirects": 1,
|
||||
"tlnamespace": 10,
|
||||
"tltemplates": "|".join(disambig_templates),
|
||||
"prop": "templates",
|
||||
}
|
||||
return params
|
||||
|
||||
|
||||
def needs_disambig(link: dict[str, Any]) -> bool:
|
||||
"""Is this a disambiguation link."""
|
||||
return bool(
|
||||
not link["title"].endswith(" (disambiguation)") and link.get("templates")
|
||||
)
|
||||
|
||||
|
||||
def get_article_links(enwiki: str) -> list[str]:
|
||||
"""Get links that appear in this article."""
|
||||
|
||||
params: dict[str, str | int] = link_params(enwiki)
|
||||
links: set[str] = set()
|
||||
|
||||
redirects = defaultdict(set)
|
||||
|
||||
while True:
|
||||
data = mediawiki_api.get(params)
|
||||
pages = data["query"].pop("pages")
|
||||
for r in data["query"].pop("redirects"):
|
||||
redirects[r["to"]].add(r["from"])
|
||||
|
||||
links.update(page["title"] for page in pages if needs_disambig(page))
|
||||
|
||||
if "continue" not in data:
|
||||
break
|
||||
|
||||
params["gplcontinue"] = data["continue"]["gplcontinue"]
|
||||
|
||||
for link in set(links):
|
||||
if link in redirects:
|
||||
links.update(redirects[link])
|
||||
|
||||
return list(links)
|
||||
|
||||
# return {link["title"] for link in r.json()["query"]["pages"][0]["links"]}
|
||||
|
||||
|
||||
def get_article_html(enwiki: str) -> str:
|
||||
"""Parse article wikitext and return HTML."""
|
||||
text: str = mediawiki_api.parse_page(enwiki)["text"]
|
||||
return text
|
||||
|
||||
|
||||
class DabItem(TypedDict):
|
||||
"""Represent a disabiguation page."""
|
||||
|
||||
num: int
|
||||
title: str
|
||||
html: str
|
||||
|
||||
|
||||
def delete_toc(root: lxml.html.HtmlElement) -> None:
|
||||
"""Delete table of contents from article HTML."""
|
||||
for toc in root.findall(".//div[@class='toc']"):
|
||||
toc.getparent().remove(toc)
|
||||
|
||||
|
||||
def get_dab_html(dab_num: int, title: str) -> str:
|
||||
"""Parse dab page and rewrite links."""
|
||||
dab_html = get_article_html(title)
|
||||
root = lxml.html.fromstring(dab_html)
|
||||
delete_toc(root)
|
||||
|
||||
element_id_map = {e.get("id"): e for e in root.findall(".//*[@id]")}
|
||||
|
||||
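# Outgoing links become select_dab() handlers; fragment links and their targets get a per-dab prefix so ids stay unique.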
for a in root.findall(".//a[@href]"):
|
||||
href: str | None = a.get("href")
|
||||
if not href:
|
||||
continue
|
||||
if not href.startswith("#"):
|
||||
a.set("href", "#")
|
||||
a.set("onclick", f"return select_dab(this, {dab_num})")
|
||||
continue
|
||||
|
||||
destination_element = element_id_map[href[1:]]
|
||||
assert destination_element is not None
|
||||
destination_element.set("id", f"{dab_num}{href[1:]}")
|
||||
a.set("href", f"#{dab_num}{href[1:]}")
|
||||
|
||||
html: str = lxml.html.tostring(root, encoding=str)
|
||||
return html
|
||||
|
||||
|
||||
class Article:
|
||||
"""Current article we're working on."""
|
||||
|
||||
def __init__(self, enwiki: str) -> None:
|
||||
"""Make a new Article object."""
|
||||
self.enwiki = enwiki.replace("_", " ")
|
||||
|
||||
self.links = get_article_links(enwiki)
|
||||
|
||||
self.dab_list: list[DabItem] = []
|
||||
self.dab_lookup: dict[int, str] = {}
|
||||
self.dab_order: list[str] = []
|
||||
self.parse: Optional[dict[str, Any]] = None
|
||||
|
||||
def save_endpoint(self) -> str:
|
||||
"""Endpoint for saving changes."""
|
||||
href: str = flask.url_for("save", enwiki=self.enwiki.replace(" ", "_"))
|
||||
return href
|
||||
|
||||
def load(self) -> None:
|
||||
"""Load parsed article HTML."""
|
||||
self.parse = mediawiki_api.parse_page(self.enwiki)
|
||||
self.root = lxml.html.fromstring(self.parse.pop("text"))
|
||||
|
||||
def iter_links(self) -> Iterator[tuple[lxml.html.Element, str]]:
|
||||
"""Disambiguation links that need fixing."""
|
||||
seen = set()
|
||||
for a in self.root.findall(".//a[@href]"):
|
||||
title = a.get("title")
|
||||
if title is None or title not in self.links:
|
||||
continue
|
||||
a.set("class", "disambig")
|
||||
|
||||
if title in seen:
|
||||
continue
|
||||
seen.add(title)
|
||||
|
||||
yield a, title
|
||||
|
||||
def process_links(self) -> None:
|
||||
"""Process links in parsed wikitext."""
|
||||
for dab_num, (a, title) in enumerate(self.iter_links()):
|
||||
a.set("id", f"dab-{dab_num}")
|
||||
|
||||
dab: DabItem = {
|
||||
"num": dab_num,
|
||||
"title": title,
|
||||
"html": get_dab_html(dab_num, title),
|
||||
}
|
||||
self.dab_list.append(dab)
|
||||
self.dab_order.append(title)
|
||||
self.dab_lookup[dab_num] = title
|
||||
|
||||
def get_html(self) -> str:
|
||||
"""Return the processed article HTML."""
|
||||
html: str = lxml.html.tostring(self.root, encoding=str)
|
||||
return html
|
cmdline.py (Executable file, 114 lines)
@@ -0,0 +1,114 @@
|||
#!/usr/bin/python3
|
||||
|
||||
import collections
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import typing
|
||||
|
||||
from add_links import api
|
||||
|
||||
# from_title = sys.argv[1]
|
||||
|
||||
re_disambig = re.compile(r"^(.*) \((.*)\)$")
|
||||
|
||||
|
||||
def article_title_to_search_query(title: str) -> str:
|
||||
"""Convert from article title to search query string."""
|
||||
m = re_disambig.match(title)
|
||||
return f'"{m.group(1)}" AND "{m.group(2)}"' if m else f'"{title}"'
|
||||
|
||||
|
||||
def run_search(q: str, limit: int | str = "max") -> dict[str, typing.Any]:
|
||||
"""Search Wikipedia."""
|
||||
params = {"list": "search", "srwhat": "text", "srlimit": limit, "srsearch": q}
|
||||
return typing.cast(dict[str, typing.Any], api.api_get(params)["query"])
|
||||
|
||||
|
||||
def search_no_link(q: str) -> tuple[int, list[dict[str, str | int]]]:
|
||||
"""Search for mentions of article title with no link included."""
|
||||
query = run_search(article_title_to_search_query(q) + f' -linksto:"{q}"', "max")
|
||||
totalhits = query["searchinfo"]["totalhits"]
|
||||
results = query["search"]
|
||||
return (totalhits, results)
|
||||
|
||||
|
||||
def search_count(q: str) -> int:
|
||||
"""How often does this article title appear in Wikipedia."""
|
||||
query = run_search(article_title_to_search_query(q), limit=0)
|
||||
return typing.cast(int, query["searchinfo"]["totalhits"]) - 1
|
||||
|
||||
|
||||
def search_count_with_link(q: str) -> int:
|
||||
"""How often does this article title appear in Wikipedia."""
|
||||
query = run_search(article_title_to_search_query(q) + f' linksto:"{q}"', limit=0)
|
||||
return typing.cast(int, query["searchinfo"]["totalhits"])
|
||||
|
||||
|
||||
def parse_contribs() -> list[tuple[str, int]]:
|
||||
re_comment = re.compile(r"^link \[\[(.*)\]\] using")
|
||||
|
||||
links: collections.Counter[str] = collections.Counter()
|
||||
|
||||
for line in open("../wikipedia-contribs/contribs"):
|
||||
if (
|
||||
'"comment": "link ' not in line
|
||||
or "using [[User:Edward/Find link|Find link]]" not in line
|
||||
):
|
||||
continue
|
||||
comment = json.loads(line)["comment"]
|
||||
|
||||
m = re_comment.match(comment)
|
||||
if not m:
|
||||
continue
|
||||
link = m.group(1)
|
||||
|
||||
if "|" not in link:
|
||||
links[link] += 1
|
||||
|
||||
return links.most_common(200)
|
||||
|
||||
|
||||
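# Script body: for each frequently linked title in past contributions, record how often it appears with and without a link.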
with open("examples") as f:
|
||||
seen = {json.loads(line)["title"] for line in f}
|
||||
|
||||
|
||||
out = open("examples", "a")
|
||||
for from_title, num in parse_contribs():
|
||||
if from_title in seen:
|
||||
continue
|
||||
count = search_count(from_title)
|
||||
count_with_link = search_count_with_link(from_title)
|
||||
ratio = float(count_with_link) / float(count)
|
||||
|
||||
print(from_title, count, count_with_link, f"{ratio:.1%}")
|
||||
print(
|
||||
json.dumps(
|
||||
{"title": from_title, "total": count, "with_links": count_with_link}
|
||||
),
|
||||
file=out,
|
||||
)
|
||||
out.flush()
|
||||
time.sleep(0.1)
|
||||
out.close()
|
||||
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
count = search_count(from_title)
|
||||
count_with_link = search_count_with_link(from_title)
|
||||
ratio = float(count_with_link) / float(count)
|
||||
|
||||
print(count, count_with_link, f"{ratio:.1%}")
|
||||
|
||||
sys.exit(0)
|
||||
|
||||
totalhits, search_hits = search_no_link(from_title)
|
||||
|
||||
for hit in search_hits:
|
||||
print(" ", hit)
|
||||
print(count, count_with_link, f"{ratio:.1%}", totalhits, len(search_hits))
|
||||
|
||||
# ret = core.do_search(from_title)
|
||||
# print(ret)
|
frontend/.eslintrc.js (Normal file, 23 lines)
@@ -0,0 +1,23 @@
module.exports = {
    "env": {
        "browser": true,
        "es6": true
    },
    "extends": [
        "plugin:vue/essential",
        "standard"
    ],
    "globals": {
        "Atomics": "readonly",
        "SharedArrayBuffer": "readonly"
    },
    "parserOptions": {
        "ecmaVersion": 14,
        "sourceType": "module"
    },
    "plugins": [
        "vue"
    ],
    "rules": {
    }
};
frontend/App.vue (Normal file, 55 lines)
@@ -0,0 +1,55 @@
<template>
  Hello world: {{ title }}

  <div v-for="hit in this.hits" class="mt-3">
    <div><strong>{{ hit.title }}</strong> ({{ hit.wordcount }} words)</div>
    <div v-html="hit.snippet"></div>
    <table v-html="hit.diff"></table>
    <div>replacement: {{ hit.replacement }}</div>
  </div>
</template>

<script>
import axios from "redaxios";

export default {
  props: {
    title: String,
    api_base_url: String,
  },
  data() {
    return {
      hits: [],
    };
  },
  computed: {
  },
  watch: {
  },
  methods: {
    api_call(endpoint, options) {
      var url = `${this.api_base_url}/${endpoint}`;
      return axios.get(url, options).catch(this.show_api_error_modal);
    },
    add_hit(hit) {
      var params = { link_from: hit.title, link_to: this.title };
      this.api_call("valid_hit", { params: params}).then((response) => {
        if (response.data.valid) {
          hit.diff = response.data.diff
          hit.replacement = response.data.replacement
          this.hits.push(hit);
        }
      });
    }
  },
  mounted() {
    var params = { title: this.title }
    this.api_call("hits", { params: params}).then((response) => {
      response.data.hits.forEach((hit) => { this.add_hit(hit) });
    });
  }
};
</script>

<style>
</style>
7
frontend/entry.js
Normal file
|
@ -0,0 +1,7 @@
|
|||
import {createApp} from 'vue';
|
||||
import App from './App.vue';
|
||||
|
||||
export default function(props) {
|
||||
const app = createApp(App, props).mount('#app');
|
||||
return app;
|
||||
}
|
18
package.json
Normal file
|
@ -0,0 +1,18 @@
|
|||
{
|
||||
"name": "add-links",
|
||||
"version": "0.0.1",
|
||||
"scripts": {
|
||||
"dev": "vite",
|
||||
"build": "vite build"
|
||||
},
|
||||
"dependencies": {
|
||||
"bootstrap": "^5.2.3",
|
||||
"vue": "^3.3.4"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@vitejs/plugin-vue": "^4.2.3",
|
||||
"eslint": "^8.41.0",
|
||||
"eslint-plugin-vue": "^9.13.0",
|
||||
"vite": "^4.3.8"
|
||||
}
|
||||
}
|
10
templates/all_done.html
Normal file
|
@ -0,0 +1,10 @@
|
|||
{% extends "base.html" %}
|
||||
|
||||
{% block title %}Index{% endblock %}
|
||||
|
||||
{% block content %}
|
||||
<div class="container">
|
||||
<h1>All done</h1>
|
||||
<div><a href="{{ url_for('index') }}">back to index</a></div>
|
||||
</div>
|
||||
{% endblock %}
|
56
templates/article.html
Normal file
|
@ -0,0 +1,56 @@
|
|||
{% extends "base.html" %}
|
||||
|
||||
{% block title %}{{ title }}{% endblock %}
|
||||
|
||||
{% block style %}
|
||||
<style>
|
||||
|
||||
span.exact { padding: 2px; background: green; color: white; font-weight: bold; }
|
||||
span.nomatch { padding: 2px; background: red; color: white; font-weight: bold; }
|
||||
span.case_mismatch { padding: 2px; background: orange; color: white; font-weight: bold; }
|
||||
span.searchmatch { font-weight: bold; }
|
||||
|
||||
table.diff,td.diff-otitle,td.diff-ntitle{background-color:white}
|
||||
td.diff-otitle,td.diff-ntitle{text-align:center}
|
||||
td.diff-marker{text-align:right;font-weight:bold;font-size:1.25em}
|
||||
td.diff-lineno{font-weight:bold}
|
||||
td.diff-addedline,td.diff-deletedline,td.diff-context{font-size:88%;vertical-align:top;white-space:-moz-pre-wrap;white-space:pre-wrap}
|
||||
td.diff-addedline,td.diff-deletedline{border-style:solid;border-width:1px 1px 1px 4px;border-radius:0.33em}
|
||||
td.diff-addedline{border-color:#a3d3ff}
|
||||
td.diff-deletedline{border-color:#ffe49c}
|
||||
td.diff-context{background:#f3f3f3;color:#333333;border-style:solid;border-width:1px 1px 1px 4px;border-color:#e6e6e6;border-radius:0.33em}
|
||||
.diffchange{font-weight:bold;text-decoration:none}
|
||||
table.diff{border:none;width:98%;border-spacing:4px; table-layout:fixed}
|
||||
td.diff-addedline .diffchange,td.diff-deletedline .diffchange{border-radius:0.33em;padding:0.25em 0}
|
||||
td.diff-addedline .diffchange{background:#d8ecff}
|
||||
td.diff-deletedline .diffchange{background:#feeec8}
|
||||
table.diff td{padding:0.33em 0.66em}
|
||||
table.diff col.diff-marker{width:2%}
|
||||
table.diff col.diff-content{width:48%}
|
||||
table.diff td div{ word-wrap:break-word; overflow:auto}
|
||||
</style>
|
||||
{% endblock %}
|
||||
|
||||
{% block content %}
|
||||
<div class="container">
|
||||
<h1>{{ self.title() }}</h1>
|
||||
<form>
|
||||
<input name="q">
|
||||
<input type="submit" value="search">
|
||||
</form>
|
||||
<div id="app"></div>
|
||||
</div>
|
||||
|
||||
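<!-- Mount the Vue app built by Vite, passing the article title and API base URL as props. -->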
<script type="module">
|
||||
import main from {{ url_for('static', filename='add_links.es.js') | tojson }};
|
||||
const props = {
|
||||
title: {{ title | tojson }},
|
||||
api_base_url: "/api/1"
|
||||
}
|
||||
main(props);
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
{% endblock %}
|
||||
|
66
templates/article2.html
Normal file
|
@ -0,0 +1,66 @@
|
|||
{% extends "base.html" %}
|
||||
|
||||
{% block title %}{{ title }}{% endblock %}
|
||||
|
||||
{% block style %}
|
||||
<style>
|
||||
|
||||
span.exact { padding: 2px; background: green; color: white; font-weight: bold; }
|
||||
span.nomatch { padding: 2px; background: red; color: white; font-weight: bold; }
|
||||
span.case_mismatch { padding: 2px; background: orange; color: white; font-weight: bold; }
|
||||
span.searchmatch { font-weight: bold; }
|
||||
|
||||
table.diff,td.diff-otitle,td.diff-ntitle{background-color:white}
|
||||
td.diff-otitle,td.diff-ntitle{text-align:center}
|
||||
td.diff-marker{text-align:right;font-weight:bold;font-size:1.25em}
|
||||
td.diff-lineno{font-weight:bold}
|
||||
td.diff-addedline,td.diff-deletedline,td.diff-context{font-size:88%;vertical-align:top;white-space:-moz-pre-wrap;white-space:pre-wrap}
|
||||
td.diff-addedline,td.diff-deletedline{border-style:solid;border-width:1px 1px 1px 4px;border-radius:0.33em}
|
||||
td.diff-addedline{border-color:#a3d3ff}
|
||||
td.diff-deletedline{border-color:#ffe49c}
|
||||
td.diff-context{background:#f3f3f3;color:#333333;border-style:solid;border-width:1px 1px 1px 4px;border-color:#e6e6e6;border-radius:0.33em}
|
||||
.diffchange{font-weight:bold;text-decoration:none}
|
||||
table.diff{border:none;width:98%;border-spacing:4px; table-layout:fixed}
|
||||
td.diff-addedline .diffchange,td.diff-deletedline .diffchange{border-radius:0.33em;padding:0.25em 0}
|
||||
td.diff-addedline .diffchange{background:#d8ecff}
|
||||
td.diff-deletedline .diffchange{background:#feeec8}
|
||||
table.diff td{padding:0.33em 0.66em}
|
||||
table.diff col.diff-marker{width:2%}
|
||||
table.diff col.diff-content{width:48%}
|
||||
table.diff td div{ word-wrap:break-word; overflow:auto}
|
||||
</style>
|
||||
{% endblock %}
|
||||
|
||||
{% block content %}
|
||||
<div class="container">
|
||||
<h1>{{ self.title() }}</h1>
|
||||
<form action="{{ url_for('index') }}">
|
||||
<input name="q">
|
||||
<input type="submit" value="search">
|
||||
</form>
|
||||
|
||||
<div>Username: {{ g.user }}</div>
|
||||
|
||||
<div><a href="https://en.wikipedia.org/wiki/{{ title }}" target="_blank">view article</a></div>
|
||||
|
||||
<div><a href="{{ url_for('index') }}">back to index</a></div>
|
||||
|
||||
<div>total: {{ total }}</div>
|
||||
<div>with link: {{ with_link }}</div>
|
||||
<div>ratio: {{ "{:.1%}".format(with_link / total) }}</div>
|
||||
<div>hit: {{ hit }}</div>
|
||||
<div>replacement: {{ found.replacement }}</div>
|
||||
<div>section: {{ found.section }}</div>
|
||||
<table>
|
||||
{{ diff | safe }}
|
||||
</table>
|
||||
<form method="POST">
|
||||
<input type="hidden" name="hit" value="{{ hit.title }}">
|
||||
<div class="my-3">
|
||||
<input type="submit" class="btn btn-primary" value="save"/>
|
||||
<a href="{{ url_for('article_page', url_title=url_title, after=hit['title']) }}" class="btn btn-primary">skip</a>
|
||||
</div>
|
||||
</form>
|
||||
</div>
|
||||
{% endblock %}
|
||||
|
22
templates/base.html
Normal file
|
@ -0,0 +1,22 @@
|
|||
<!doctype html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<link href="{{ url_for("static", filename="bootstrap/css/bootstrap.min.css") }}" rel="stylesheet">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||
|
||||
<title>
|
||||
{% block title %}{% endblock %}
|
||||
</title>
|
||||
|
||||
{% block style %}{% endblock %}
|
||||
</head>
|
||||
|
||||
<body>
|
||||
{% block content %}{% endblock %}
|
||||
|
||||
<script src="{{ url_for('static', filename='bootstrap/js/bootstrap.bundle.min.js') }}"></script>
|
||||
|
||||
{% block script %}{% endblock %}
|
||||
</body>
|
||||
</html>
|
25
templates/index.html
Normal file
|
@ -0,0 +1,25 @@
|
|||
{% extends "base.html" %}
|
||||
|
||||
{% block title %}Index{% endblock %}
|
||||
|
||||
{% block content %}
|
||||
<div class="container">
|
||||
<h1>Index</h1>
|
||||
<form>
|
||||
<input name="q">
|
||||
<input type="submit" value="search">
|
||||
</form>
|
||||
|
||||
<div>Username: {{ g.user }}</div>
|
||||
|
||||
<table class="table w-auto">
|
||||
{% for item in examples %}
|
||||
<tr>
|
||||
<td><a href="{{ article_url(item.title) }}">{{ item.title }}</a></td>
|
||||
<td>{{ item.total }}</td>
|
||||
<td>{{ "{:.1%}".format(item.with_links / item.total) }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</table>
|
||||
</div>
|
||||
{% endblock %}
|
10
templates/save_done.html
Normal file
|
@ -0,0 +1,10 @@
|
|||
{% extends "base.html" %}
|
||||
|
||||
{% block title %}Index{% endblock %}
|
||||
|
||||
{% block content %}
|
||||
<div class="container">
|
||||
<h1>Save done</h1>
|
||||
<div>Save is complete.</div>
|
||||
</div>
|
||||
{% endblock %}
|
17
vite.config.js
Normal file
|
@ -0,0 +1,17 @@
|
|||
import { defineConfig } from 'vite'
|
||||
import vue from '@vitejs/plugin-vue'
|
||||
import path from 'path'
|
||||
|
||||
export default defineConfig({
|
||||
plugins: [vue()],
|
||||
define: {
|
||||
'process.env.NODE_ENV': JSON.stringify('production'),
|
||||
},
|
||||
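// Library build: emit add_links.<format>.js so the Flask templates can load it from /static.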
build: {
|
||||
lib: {
|
||||
entry: path.resolve(__dirname, 'frontend/entry.js'),
|
||||
name: 'AddLinks',
|
||||
fileName: (format) => `add_links.${format}.js`,
|
||||
},
|
||||
},
|
||||
})
|
362
web_view.py
Executable file
|
@ -0,0 +1,362 @@
|
|||
#!/usr/bin/python3
|
||||
|
||||
import html
|
||||
import itertools
|
||||
import json
|
||||
import re
|
||||
import typing
|
||||
|
||||
import flask
|
||||
import werkzeug
|
||||
from requests_oauthlib import OAuth1Session
|
||||
from werkzeug.wrappers.response import Response
|
||||
|
||||
from add_links import api, core, mediawiki_api, wikidata_oauth
|
||||
from add_links.match import NoMatch, get_diff, get_match
|
||||
|
||||
app = flask.Flask(__name__)
|
||||
app.config.from_object("config.default")
|
||||
app.debug = True
|
||||
|
||||
wiki_hostname = "en.wikipedia.org"
|
||||
wiki_api_php = f"https://{wiki_hostname}/w/api.php"
|
||||
wiki_index_php = f"https://{wiki_hostname}/w/index.php"
|
||||
|
||||
|
||||
class Hit(typing.TypedDict):
|
||||
"""Candidate articles."""
|
||||
|
||||
ns: int
|
||||
title: str
|
||||
pageid: int
|
||||
size: int
|
||||
wordcount: int
|
||||
snippet: str
|
||||
timestamp: str
|
||||
|
||||
|
||||
re_disambig = re.compile(r"^(.*) \((.*)\)$")
|
||||
|
||||
|
||||
def load_examples() -> list[dict[str, str | int]]:
|
||||
"""Load examples."""
|
||||
return [json.loads(line) for line in open("examples")]
|
||||
|
||||
|
||||
def article_title_to_search_query(title: str) -> str:
|
||||
"""Convert from article title to search query string."""
|
||||
m = re_disambig.match(title)
|
||||
return f'"{m.group(1)}" AND "{m.group(2)}"' if m else f'"{title}"'
|
||||
|
||||
|
||||
def run_search(q: str, limit: int | str = "max") -> dict[str, typing.Any]:
|
||||
"""Search Wikipedia."""
|
||||
params = {"list": "search", "srwhat": "text", "srlimit": limit, "srsearch": q}
|
||||
return typing.cast(dict[str, typing.Any], api.api_get(params)["query"])
|
||||
|
||||
|
||||
def article_url(title: str) -> str:
|
||||
"""URL for search page."""
|
||||
return flask.url_for("article_page", url_title=title.replace(" ", "_"))
|
||||
|
||||
|
||||
def search_count(q: str) -> int:
|
||||
"""How often does this article title appear in Wikipedia."""
|
||||
query = run_search(article_title_to_search_query(q), limit=0)
|
||||
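# Subtract one, presumably to exclude the article's own page from the hit count.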
return typing.cast(int, query["searchinfo"]["totalhits"]) - 1
|
||||
|
||||
|
||||
def search_count_with_link(q: str) -> int:
|
||||
"""How often does this article title appear in Wikipedia."""
|
||||
query = run_search(article_title_to_search_query(q) + f' linksto:"{q}"', limit=0)
|
||||
return typing.cast(int, query["searchinfo"]["totalhits"])
|
||||
|
||||
|
||||
def search_no_link(q: str) -> tuple[int, list[Hit]]:
|
||||
"""Search for mentions of article title with no link included."""
|
||||
query = run_search(article_title_to_search_query(q) + f' -linksto:"{q}"', "max")
|
||||
totalhits = query["searchinfo"]["totalhits"]
|
||||
results = query["search"]
|
||||
return (totalhits, results)
|
||||
|
||||
|
||||
@app.before_request
|
||||
def global_user() -> None:
|
||||
"""Make username available everywhere."""
|
||||
flask.g.user = wikidata_oauth.get_username()
|
||||
|
||||
|
||||
@app.route("/")
|
||||
def index() -> str | Response:
|
||||
"""Index page."""
|
||||
if "oauth_verifier" in flask.request.args and "oauth_token" in flask.request.args:
|
||||
return flask.redirect(flask.url_for("oauth_callback", **flask.request.args))
|
||||
|
||||
examples = load_examples()
|
||||
examples.sort(
|
||||
key=lambda i: float(i["with_links"]) / float(i["total"]), reverse=True
|
||||
)
|
||||
|
||||
if q := flask.request.args.get("q"):
|
||||
if q_trimmed := q.strip():
|
||||
return flask.redirect(article_url(q_trimmed))
|
||||
|
||||
return flask.render_template(
|
||||
"index.html", examples=examples, article_url=article_url
|
||||
)
|
||||
|
||||
|
||||
def case_flip(s: str) -> str:
|
||||
"""Switch case of character."""
|
||||
if s.islower():
|
||||
return s.upper()
|
||||
if s.isupper():
|
||||
return s.lower()
|
||||
return s
|
||||
|
||||
|
||||
def case_flip_first(s: str) -> str:
|
||||
"""Switch case of first character in string."""
|
||||
return case_flip(s[0]) + s[1:]
|
||||
|
||||
|
||||
def tidy_snippet(snippet: str) -> str:
|
||||
"""Remove HTML from snippet."""
|
||||
snippet = snippet.replace("\u2013", "-")
|
||||
snippet = snippet.replace("</span>", "")
|
||||
snippet = snippet.replace('<span class="searchmatch">', "")
|
||||
return html.unescape(snippet)
|
||||
|
||||
|
||||
@app.route("/oauth/start")
|
||||
def start_oauth() -> Response:
|
||||
"""Start OAuth."""
|
||||
next_page = flask.request.args.get("next")
|
||||
if next_page:
|
||||
flask.session["after_login"] = next_page
|
||||
|
||||
client_key = app.config["CLIENT_KEY"]
|
||||
client_secret = app.config["CLIENT_SECRET"]
|
||||
request_token_url = wiki_index_php + "?title=Special%3aOAuth%2finitiate"
|
||||
|
||||
oauth = OAuth1Session(client_key, client_secret=client_secret, callback_uri="oob")
|
||||
fetch_response = oauth.fetch_request_token(request_token_url)
|
||||
|
||||
flask.session["owner_key"] = fetch_response.get("oauth_token")
|
||||
flask.session["owner_secret"] = fetch_response.get("oauth_token_secret")
|
||||
|
||||
base_authorization_url = f"https://{wiki_hostname}/wiki/Special:OAuth/authorize"
|
||||
authorization_url = oauth.authorization_url(
|
||||
base_authorization_url, oauth_consumer_key=client_key
|
||||
)
|
||||
return flask.redirect(authorization_url)
|
||||
|
||||
|
||||
@app.route("/oauth/callback", methods=["GET"])
|
||||
def oauth_callback() -> werkzeug.wrappers.response.Response:
|
||||
"""Oauth callback."""
|
||||
client_key = app.config["CLIENT_KEY"]
|
||||
client_secret = app.config["CLIENT_SECRET"]
|
||||
|
||||
oauth = OAuth1Session(
|
||||
client_key,
|
||||
client_secret=client_secret,
|
||||
resource_owner_key=flask.session["owner_key"],
|
||||
resource_owner_secret=flask.session["owner_secret"],
|
||||
)
|
||||
|
||||
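# Exchange the verifier returned in the callback URL for a permanent access token.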
oauth_response = oauth.parse_authorization_response(flask.request.url)
|
||||
verifier = oauth_response.get("oauth_verifier")
|
||||
access_token_url = wiki_index_php + "?title=Special%3aOAuth%2ftoken"
|
||||
oauth = OAuth1Session(
|
||||
client_key,
|
||||
client_secret=client_secret,
|
||||
resource_owner_key=flask.session["owner_key"],
|
||||
resource_owner_secret=flask.session["owner_secret"],
|
||||
verifier=verifier,
|
||||
)
|
||||
|
||||
oauth_tokens = oauth.fetch_access_token(access_token_url)
|
||||
flask.session["owner_key"] = oauth_tokens.get("oauth_token")
|
||||
flask.session["owner_secret"] = oauth_tokens.get("oauth_token_secret")
|
||||
|
||||
print("login successful")
|
||||
|
||||
next_page = flask.session.get("after_login")
|
||||
return flask.redirect(next_page if next_page else flask.url_for("index"))
|
||||
|
||||
|
||||
@app.route("/oauth/disconnect")
|
||||
def oauth_disconnect() -> werkzeug.wrappers.response.Response:
|
||||
"""Disconnect OAuth."""
|
||||
for key in "owner_key", "owner_secret", "username", "after_login":
|
||||
if key in flask.session:
|
||||
del flask.session[key]
|
||||
return flask.redirect(flask.url_for("index"))
|
||||
|
||||
|
||||
def match_type(q: str, snippet: str) -> str | None:
|
||||
"""Discover match type, ''exact', 'case_mismatch' or None.
|
||||
|
||||
>>> match_type('foo', 'foo')
|
||||
'exact'
|
||||
>>> match_type('foo', 'bar') is None
|
||||
True
|
||||
>>> match_type('bar', 'foo bar baz')
|
||||
'exact'
|
||||
>>> match_type('clean coal technology', 'foo clean coal technologies baz')
|
||||
'exact'
|
||||
>>> match_type('bar', 'foo Bar baz')
|
||||
'exact'
|
||||
>>> match_type('bar', 'foo BAR baz')
|
||||
'case_mismatch'
|
||||
>>> match_type('foo-bar', 'aa foo-bar cc')
|
||||
'exact'
|
||||
>>> match_type(u'foo\u2013bar', 'aa foo-bar cc')
|
||||
'exact'
|
||||
"""
|
||||
q = q.replace("\u2013", "-")
|
||||
snippet = tidy_snippet(snippet)
|
||||
|
||||
if q in snippet or case_flip_first(q) in snippet:
|
||||
return "exact"
|
||||
match = None
|
||||
if q.lower() in snippet.lower():
|
||||
match = "case_mismatch"
|
||||
if match != "exact" and q.endswith("y"):
|
||||
if q[:-1] in snippet or case_flip_first(q[:-1]) in snippet:
|
||||
return "exact"
|
||||
elif match is None:
|
||||
if q[:-1].lower() in snippet.lower():
|
||||
match = "case_mismatch"
|
||||
return match
|
||||
|
||||
|
||||
class NoGoodHit(Exception):
|
||||
pass
|
||||
|
||||
|
||||
def get_best_hit(title: str, hits: list[Hit]) -> tuple[Hit, dict[str, typing.Any]]:
|
||||
"""Find the best hit within the search results."""
|
||||
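# Skip the article itself and non-exact matches; return the first hit that yields a diff.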
for hit in hits:
|
||||
if hit["title"].lower() == title.lower():
|
||||
continue
|
||||
if match_type(title, hit["snippet"]) != "exact":
|
||||
continue
|
||||
|
||||
try:
|
||||
print(f'get diff: {hit["title"]}, {title}')
|
||||
found = get_diff(title, hit["title"], None)
|
||||
except NoMatch:
|
||||
print("no match")
|
||||
continue
|
||||
|
||||
return (hit, found)
|
||||
|
||||
raise NoGoodHit
|
||||
|
||||
|
||||
@app.route("/<path:url_title>", methods=["GET", "POST"])
|
||||
def article_page(url_title: str) -> str | Response:
|
||||
"""Article page."""
|
||||
from_title = url_title.replace("_", " ").strip()
|
||||
|
||||
if flask.request.method == "POST":
|
||||
hit_title = flask.request.form["hit"]
|
||||
do_save(from_title, hit_title)
|
||||
return flask.redirect(
|
||||
flask.url_for("article_page", url_title=url_title, after=hit_title)
|
||||
)
|
||||
|
||||
total = search_count(from_title)
|
||||
with_link = search_count_with_link(from_title)
|
||||
|
||||
no_link_count, hits = search_no_link(from_title)
|
||||
|
||||
after = flask.request.args.get("after")
|
||||
if after:
|
||||
print(after)
|
||||
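# Resume after the hit that was just saved or skipped: drop every hit up to and
# including the "after" title.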
hits_iter = itertools.dropwhile(lambda hit: hit["title"] != after, hits)
|
||||
skip = next(hits_iter, None)
|
||||
if skip:
|
||||
hits = list(hits_iter)
|
||||
|
||||
try:
|
||||
hit, found = get_best_hit(from_title, hits)
|
||||
except NoGoodHit:
|
||||
return flask.render_template("all_done.html")
|
||||
|
||||
return flask.render_template(
|
||||
"article2.html",
|
||||
title=from_title,
|
||||
total=total,
|
||||
with_link=with_link,
|
||||
hit=hit,
|
||||
replacement=found["replacement"],
|
||||
diff=found["diff"],
|
||||
found=found,
|
||||
url_title=url_title,
|
||||
)
|
||||
|
||||
|
||||
def do_save(title: str, hit_title: str) -> str:
|
||||
"""Update page on Wikipedia."""
|
||||
token = wikidata_oauth.get_token()
|
||||
|
||||
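# get_match locates the mention and returns the section wikitext with the link added.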
found = get_match(title, hit_title, None)
|
||||
|
||||
summary = (
|
||||
f"link [[{found['replacement']}]] using [[:en:User:Edward/Find link|Find link]]"
|
||||
)
|
||||
|
||||
edit = mediawiki_api.edit_page(
|
||||
pageid=found["pageid"],
|
||||
section=found["section_num"],
|
||||
text=found["section_text"],
|
||||
summary=summary,
|
||||
baserevid=found["revid"],
|
||||
token=token,
|
||||
)
|
||||
|
||||
return edit
|
||||
|
||||
|
||||
@app.route("/saved")
|
||||
def save_done() -> str:
|
||||
"""Save complete."""
|
||||
return flask.render_template("save_done.html")
|
||||
|
||||
|
||||
@app.route("/api/1/hits")
|
||||
def api_hits() -> werkzeug.wrappers.response.Response:
|
||||
"""Return canidates for the given article title."""
|
||||
title = flask.request.args.get("title")
|
||||
assert title
|
||||
ret = core.do_search(title)
|
||||
return flask.jsonify(title=title, hits=ret["results"])
|
||||
|
||||
# mock_hits: list[Hit] = json.load(open("sample.json"))
|
||||
# return flask.jsonify(title=title, hits=mock_hits)
|
||||
|
||||
|
||||
@app.route("/api/1/valid_hit")
|
||||
def api_valid_hit() -> werkzeug.wrappers.response.Response:
|
||||
"""Return canidates for the given article title."""
|
||||
link_from = flask.request.args.get("link_from")
|
||||
link_to = flask.request.args.get("link_to")
|
||||
|
||||
try:
|
||||
# get_diff returns a mapping, as in get_best_hit above.
found = get_diff(link_to, link_from, None)
|
||||
except NoMatch:
|
||||
return flask.jsonify(valid=False)
|
||||
|
||||
return flask.jsonify(valid=True, diff=found["diff"], replacement=found["replacement"])
|
||||
|
||||
|
||||
@app.route("/favicon.ico")
|
||||
def favicon() -> None:
|
||||
flask.abort(404)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run(host="0.0.0.0", port=8000)
|