Initial commit
commit f07b407e7a
.gitignore (vendored, new file, +4)
@@ -0,0 +1,4 @@
__pycache__
.mypy_cache/
node_modules
package-lock.json
add_front_end_libraries.py (executable, new file, +22)
@@ -0,0 +1,22 @@
#!/usr/bin/python3

import os
import shutil
import subprocess

STATIC_DIR = "static"

assert os.path.exists("package.json") and os.path.exists("node_modules")

if not os.path.exists(STATIC_DIR):
    os.mkdir(STATIC_DIR)

shutil.copytree(
    "node_modules/bootstrap/dist/",
    os.path.join(STATIC_DIR, "bootstrap"),
    dirs_exist_ok=True,
)

subprocess.run(["npm", "run", "build"], check=True)

shutil.copy("dist/add_links.es.js", "static")
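The build script above assumes npm install has already been run: it asserts that package.json and node_modules exist, copies the Bootstrap dist files into static/bootstrap, and expects npm run build to produce dist/add_links.es.js before copying it into static/.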
add_links/__init__.py (new file, empty)
add_links/api.py (new file, +284)
@@ -0,0 +1,284 @@
import re
from typing import Any

import requests
from requests.adapters import HTTPAdapter
from simplejson.scanner import JSONDecodeError

from .language import get_current_language
from .util import is_disambig

ua = (
    "find-link/2.2 "
    + "(https://github.com/EdwardBetts/find_link; contact: edward@4angle.com)"
)
re_disambig = re.compile(r"^(.*) \((.*)\)$")


def get_query_url() -> str:
    """Get the wikipedia query API for the current language."""
    return f"https://{get_current_language()}.wikipedia.org/w/api.php"


sessions = {}


def get_session():
    lang = get_current_language()
    if lang in sessions:
        return sessions[lang]
    s = requests.Session()
    s.headers = {"User-Agent": ua}
    s.mount("https://en.wikipedia.org", HTTPAdapter(max_retries=10))
    s.params = {
        "format": "json",
        "action": "query",
        "formatversion": 2,
    }
    sessions[lang] = s
    return s


class MediawikiError(Exception):
    pass


class MultipleRedirects(Exception):
    pass


class IncompleteReply(Exception):
    pass


class MissingPage(Exception):
    pass


def check_for_error(json_data):
    if "error" in json_data:
        raise MediawikiError(json_data["error"]["info"])


webpage_error = (
    "Our servers are currently under maintenance or experiencing a technical problem."
)


def api_get(params: dict[str, Any]) -> dict[str, Any]:
    """Make call to Wikipedia API."""
    s = get_session()

    r = s.get(get_query_url(), params=params)
    try:
        ret = r.json()
    except JSONDecodeError:
        if webpage_error in r.text:
            raise MediawikiError(webpage_error)
        else:
            raise MediawikiError("unknown error")
    check_for_error(ret)
    return ret


def get_first_page(params: dict[str, str]) -> dict[str, Any]:
    """Run Wikipedia API query and return the first page."""
    page = api_get(params)["query"]["pages"][0]
    if page.get("missing"):
        raise MissingPage
    return page


def random_article_list(limit=50):
    params = {
        "list": "random",
        "rnnamespace": "0",
        "rnlimit": limit,
    }

    return api_get(params)["query"]["random"]


def wiki_search(q):
    m = re_disambig.match(q)
    if m:
        search = '"{}" AND "{}"'.format(*m.groups())
    else:
        search = '"{}"'.format(q)

    params = {
        "list": "search",
        "srwhat": "text",
        "srlimit": 50,
        "srsearch": search,
        "continue": "",
    }
    ret = api_get(params)
    query = ret["query"]
    totalhits = query["searchinfo"]["totalhits"]
    results = query["search"]
    for _ in range(10):
        if "continue" not in ret:
            break
        params["sroffset"] = ret["continue"]["sroffset"]
        ret = api_get(params)
        results += ret["query"]["search"]
    return (totalhits, results)


def get_wiki_info(q):
    params = {
        "prop": "info",
        "redirects": "",
        "titles": q,
    }
    ret = api_get(params)["query"]
    if "interwiki" in ret:
        return None
    redirects = []
    if ret.get("redirects"):
        redirects = ret["redirects"]
        if len(redirects) != 1:
            # multiple redirects, we should explain to the user that this is
            # unsupported
            raise MultipleRedirects
    if ret["pages"][0].get("missing"):
        raise MissingPage(q)
    return redirects[0]["to"] if redirects else None


def cat_start(q: str) -> list[str]:
    """Find categories that start with this prefix."""
    params = {
        "list": "allpages",
        "apnamespace": 14,  # categories
        "apfilterredir": "nonredirects",
        "aplimit": 500,
        "apprefix": q,
    }
    ret = api_get(params)["query"]
    return [i["title"] for i in ret["allpages"] if i["title"] != q]


def all_pages(q: str) -> list[str]:
    """Get all article titles with a given prefix."""
    params = {
        "list": "allpages",
        "apnamespace": 0,
        "apfilterredir": "nonredirects",
        "aplimit": 500,
        "apprefix": q,
    }
    ret = api_get(params)["query"]
    return [i["title"] for i in ret["allpages"] if i["title"] != q]


def categorymembers(q: str) -> list[str]:
    """List of category members."""
    params = {
        "list": "categorymembers",
        "cmnamespace": 0,
        "cmlimit": 500,
        "cmtitle": q[0].upper() + q[1:],
    }
    ret = api_get(params)["query"]
    return [i["title"] for i in ret["categorymembers"] if i["title"] != q]


def page_links(titles):  # unused
    titles = list(titles)
    assert titles
    params = {
        "prop": "links",
        "pllimit": 500,
        "plnamespace": 0,
        "titles": "|".join(titles),
    }
    ret = api_get(params)["query"]
    return dict(
        (doc["title"], {l["title"] for l in doc["links"]})
        for doc in ret["pages"].values()
        if "links" in doc
    )


def find_disambig(titles: list[str]) -> list[str]:
    """Find disambiguation articles in the given list of titles."""
    titles = list(titles)
    assert titles
    pos = 0
    disambig: list[str] = []
    params = {
        "prop": "templates",
        "tllimit": 500,
        "tlnamespace": 10,  # templates
        "continue": "",
    }
    while pos < len(titles):
        params["titles"] = "|".join(titles[pos : pos + 50])
        ret = api_get(params)
        disambig.extend(
            doc["title"] for doc in ret["query"]["pages"] if is_disambig(doc)
        )
        for i in range(10):
            if "continue" not in ret:
                break
            tlcontinue = ret["continue"]["tlcontinue"]
            params["titles"] = "|".join(titles[pos : pos + 50])
            params["tlcontinue"] = tlcontinue
            ret = api_get(params)
            disambig.extend(
                doc["title"] for doc in ret["query"]["pages"] if is_disambig(doc)
            )
        pos += 50

    return disambig


def wiki_redirects(q):  # pages that link here
    params = {
        "list": "backlinks",
        "blfilterredir": "redirects",
        "bllimit": 500,
        "blnamespace": 0,
        "bltitle": q,
    }
    docs = api_get(params)["query"]["backlinks"]
    assert all("redirect" in doc for doc in docs)
    return (doc["title"] for doc in docs)


def wiki_backlink(q: str) -> tuple[set[str], set[str]]:
    """Get backlinks for article."""
    params = {
        "list": "backlinks",
        "bllimit": 500,
        "blnamespace": 0,
        "bltitle": q,
        "continue": "",
    }
    ret = api_get(params)
    docs = ret["query"]["backlinks"]
    while "continue" in ret:
        params["blcontinue"] = ret["continue"]["blcontinue"]
        ret = api_get(params)
        docs += ret["query"]["backlinks"]

    articles = {doc["title"] for doc in docs if "redirect" not in doc}
    redirects = {doc["title"] for doc in docs if "redirect" in doc}
    return (articles, redirects)


def call_get_diff(title, section_num, section_text):
    data = {
        "prop": "revisions",
        "rvprop": "timestamp",
        "titles": title,
        "rvsection": section_num,
        "rvdifftotext": section_text.strip(),
    }

    s = get_session()
    ret = s.post(get_query_url(), data=data).json()
    check_for_error(ret)
    return ret["query"]["pages"][0]["revisions"][0]["diff"]["body"]
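A minimal usage sketch of the API helpers above, assuming the project's dependencies (flask, requests, simplejson) are installed; the article title "Hadley cell" is only an example, and get_current_language() falls back to "en" outside a Flask request context:

    from add_links.api import wiki_search, wiki_backlink

    totalhits, results = wiki_search("Hadley cell")      # phrase search, follows up to 10 continuation pages
    articles, redirects = wiki_backlink("Hadley cell")   # titles that already link to the article
    print(totalhits, len(results), len(articles), len(redirects))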
add_links/core.py (new file, +198)
@@ -0,0 +1,198 @@
"""Core functions."""

import html
import re
import typing
from pprint import pprint

from .api import (
    MediawikiError,
    all_pages,
    cat_start,
    categorymembers,
    find_disambig,
    get_first_page,
    wiki_backlink,
    wiki_search,
)
from .util import case_flip_first, norm

re_redirect = re.compile(r"#REDIRECT \[\[(.)([^#]*?)(#.*)?\]\]")


def get_content_and_timestamp(title: str) -> tuple[str, str]:
    """Get article content and timestamp of last update."""
    params = {
        "prop": "revisions|info",
        "rvprop": "content|timestamp",
        "titles": title,
    }
    json_data: dict[str, typing.Any] = get_first_page(params)
    if json_data.get("invalid"):
        raise MediawikiError(json_data["invalidreason"])
    rev = json_data["revisions"][0]
    return (rev["content"], rev["timestamp"])


def get_revision_info(title: str) -> dict[str, typing.Any]:
    """Get info about latest revision of article."""
    params = {
        "prop": "revisions|info",
        "rvprop": "content|timestamp|ids",
        "titles": title,
    }
    json_data: dict[str, typing.Any] = get_first_page(params)
    if json_data.get("invalid"):
        raise MediawikiError(json_data["invalidreason"])
    revs = json_data.pop("revisions")
    ret = revs[0]
    ret["pageid"] = json_data["pageid"]
    pprint(json_data)
    return typing.cast(dict[str, typing.Any], ret)


def is_redirect_to(title_from: str, title_to: str) -> bool:
    title_from = title_from.replace("_", " ")
    params = {"prop": "info", "titles": title_from}
    if "redirect" not in get_first_page(params):
        return False

    params = {"prop": "revisions", "rvprop": "content", "titles": title_from}
    page_text = get_first_page(params)["revisions"][0]["content"]
    m = re_redirect.match(page_text)
    assert m
    title_to = title_to[0].upper() + title_to[1:]
    return m.group(1).upper() + m.group(2) == title_to


def find_longer(
    q: str, search: list[dict[str, typing.Any]], articles: set[str]
) -> list[str]:
    """Find other articles with titles that are longer."""
    this_title = q[0].upper() + q[1:]
    longer: list[str] = all_pages(this_title)
    lq = q.lower()
    for doc in search:
        lt = doc["title"].lower()
        if lq == lt or lq not in lt:
            continue
        articles.add(doc["title"])
        more_articles, more_redirects = wiki_backlink(doc["title"])
        articles.update(more_articles)
        if doc["title"] not in longer:
            longer.append(doc["title"])

    return longer


def tidy_snippet(snippet: str) -> str:
    """Remove HTML from snippet."""
    snippet = snippet.replace("\u2013", "-")
    snippet = snippet.replace("</span>", "")
    snippet = snippet.replace('<span class="searchmatch">', "")
    return html.unescape(snippet)


def match_type(q: str, snippet: str) -> str | None:
    """Discover match type: 'exact', 'case_mismatch' or None.

    >>> match_type('foo', 'foo')
    'exact'
    >>> match_type('foo', 'bar') is None
    True
    >>> match_type('bar', 'foo bar baz')
    'exact'
    >>> match_type('clean coal technology', 'foo clean coal technologies baz')
    'exact'
    >>> match_type('bar', 'foo Bar baz')
    'exact'
    >>> match_type('bar', 'foo BAR baz')
    'case_mismatch'
    >>> match_type('foo-bar', 'aa foo-bar cc')
    'exact'
    >>> match_type(u'foo\u2013bar', 'aa foo-bar cc')
    'exact'
    """
    q = q.replace("\u2013", "-")
    snippet = tidy_snippet(snippet)

    if q in snippet or case_flip_first(q) in snippet:
        return "exact"
    match = None
    if q.lower() in snippet.lower():
        match = "case_mismatch"
    if match != "exact" and q.endswith("y"):
        if q[:-1] in snippet or case_flip_first(q[:-1]) in snippet:
            return "exact"
    elif match is None:
        if q[:-1].lower() in snippet.lower():
            match = "case_mismatch"
    return match


def do_search(
    q: str, redirect_to: str | None = None
) -> dict[str, int | list[dict[str, typing.Any]] | list[str] | None]:
    this_title = q[0].upper() + q[1:]

    totalhits, search_hits = wiki_search(q)
    articles, redirects = wiki_backlink(redirect_to or q)
    cm = set()
    start = cat_start(q)
    if len(start) > 5:
        start = []  # big categories take too long
    for cat in set(["Category:" + this_title] + start):
        cm.update(categorymembers(cat))

    norm_q = norm(q)
    norm_match_redirect = {r for r in redirects if norm(r) == norm_q}
    longer_redirect = {r for r in redirects if q.lower() in r.lower()}

    articles.add(this_title)
    if redirect_to:
        articles.add(redirect_to[0].upper() + redirect_to[1:])

    longer_redirect = {r for r in redirects if q.lower() in r.lower()}
    for r in norm_match_redirect | longer_redirect:
        articles.add(r)
        a2, r2 = wiki_backlink(r)
        articles.update(a2)
        redirects.update(r2)

    longer = find_longer(q, search_hits, articles) if len(q) > 6 else None

    search: list[dict[str, typing.Any]] = [
        doc
        for doc in search_hits
        if doc["title"] not in articles and doc["title"] not in cm
    ]
    if search:
        disambig = set(find_disambig([doc["title"] for doc in search]))
        search = [doc for doc in search if doc["title"] not in disambig]
        # and (doc['title'] not in links or this_title not in links[doc['title']])]
        for doc in search:
            without_markup = (
                doc["snippet"]
                .replace("<span class='searchmatch'>", "")
                .replace("</span>", "")
                .replace("  ", " ")
            )
            doc["match"] = match_type(q, without_markup)
            doc["snippet_without_markup"] = without_markup
    return {
        "totalhits": totalhits,
        "results": search,
        "longer": longer,
    }


def get_case_from_content(title: str) -> str | None:
    """Check article content to find the case of the article title."""
    content, timestamp = get_content_and_timestamp(title)
    if title == title.lower() and title in content:
        return title
    start = content.lower().find("'''" + title.replace("_", " ").lower() + "'''")
    if start != -1:
        return content[start + 3 : start + 3 + len(title)]

    return None  # article doesn't contain the title
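A sketch of how do_search() ties the search, backlink, category and disambiguation helpers together; the query string is illustrative only:

    from add_links.core import do_search

    ret = do_search("hadley cell")
    print(ret["totalhits"])
    for doc in ret["results"]:
        print(doc["title"], doc["match"])   # match is 'exact', 'case_mismatch' or None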
add_links/language.py (new file, +146)
@@ -0,0 +1,146 @@
from flask import session, has_request_context

langs = [
    ('af', 'Afrikaans', 'Afrikaans'),
    ('als', 'Alemannisch', 'Alemannic'),
    ('am', 'አማርኛ', 'Amharic'),
    ('an', 'aragonés', 'Aragonese'),
    ('ar', 'العربية', 'Arabic'),
    ('arz', 'مصرى', 'Egyptian Arabic'),
    ('ast', 'asturianu', 'Asturian'),
    ('az', 'azərbaycanca', 'Azerbaijani'),
    ('azb', 'تۆرکجه', 'Southern Azerbaijani'),
    ('ba', 'башҡортса', 'Bashkir'),
    ('bar', 'Boarisch', 'Bavarian'),
    ('bat-smg', 'žemaitėška', 'Samogitian'),
    ('be', 'беларуская', 'Belarusian'),
    ('be-tarask', 'беларуская (тарашкевіца)', 'Belarusian (Taraškievica)'),
    ('bg', 'български', 'Bulgarian'),
    ('bn', 'বাংলা', 'Bengali'),
    ('bpy', 'বিষ্ণুপ্রিয়া মণিপুরী', 'Bishnupriya Manipuri'),
    ('br', 'brezhoneg', 'Breton'),
    ('bs', 'bosanski', 'Bosnian'),
    ('bug', 'ᨅᨔ ᨕᨘᨁᨗ', 'Buginese'),
    ('ca', 'català', 'Catalan'),
    ('ce', 'нохчийн', 'Chechen'),
    ('ceb', 'Cebuano', 'Cebuano'),
    ('ckb', 'کوردیی ناوەندی', 'Kurdish (Sorani)'),
    ('cs', 'čeština', 'Czech'),
    ('cv', 'Чӑвашла', 'Chuvash'),
    ('cy', 'Cymraeg', 'Welsh'),
    ('da', 'dansk', 'Danish'),
    ('de', 'Deutsch', 'German'),
    ('el', 'Ελληνικά', 'Greek'),
    ('en', 'English', 'English'),
    ('eo', 'Esperanto', 'Esperanto'),
    ('es', 'español', 'Spanish'),
    ('et', 'eesti', 'Estonian'),
    ('eu', 'euskara', 'Basque'),
    ('fa', 'فارسی', 'Persian'),
    ('fi', 'suomi', 'Finnish'),
    ('fo', 'føroyskt', 'Faroese'),
    ('fr', 'français', 'French'),
    ('fy', 'Frysk', 'West Frisian'),
    ('ga', 'Gaeilge', 'Irish'),
    ('gd', 'Gàidhlig', 'Scottish Gaelic'),
    ('gl', 'galego', 'Galician'),
    ('gu', 'ગુજરાતી', 'Gujarati'),
    ('he', 'עברית', 'Hebrew'),
    ('hi', 'हिन्दी', 'Hindi'),
    ('hr', 'hrvatski', 'Croatian'),
    ('hsb', 'hornjoserbsce', 'Upper Sorbian'),
    ('ht', 'Kreyòl ayisyen', 'Haitian'),
    ('hu', 'magyar', 'Hungarian'),
    ('hy', 'Հայերեն', 'Armenian'),
    ('ia', 'interlingua', 'Interlingua'),
    ('id', 'Bahasa Indonesia', 'Indonesian'),
    ('io', 'Ido', 'Ido'),
    ('is', 'íslenska', 'Icelandic'),
    ('it', 'italiano', 'Italian'),
    ('ja', '日本語', 'Japanese'),
    ('jv', 'Basa Jawa', 'Javanese'),
    ('ka', 'ქართული', 'Georgian'),
    ('kk', 'қазақша', 'Kazakh'),
    ('kn', 'ಕನ್ನಡ', 'Kannada'),
    ('ko', '한국어', 'Korean'),
    ('ku', 'Kurdî', 'Kurdish (Kurmanji)'),
    ('ky', 'Кыргызча', 'Kirghiz'),
    ('la', 'Latina', 'Latin'),
    ('lb', 'Lëtzebuergesch', 'Luxembourgish'),
    ('li', 'Limburgs', 'Limburgish'),
    ('lmo', 'lumbaart', 'Lombard'),
    ('lt', 'lietuvių', 'Lithuanian'),
    ('lv', 'latviešu', 'Latvian'),
    ('map-bms', 'Basa Banyumasan', 'Banyumasan'),
    ('mg', 'Malagasy', 'Malagasy'),
    ('min', 'Baso Minangkabau', 'Minangkabau'),
    ('mk', 'македонски', 'Macedonian'),
    ('ml', 'മലയാളം', 'Malayalam'),
    ('mn', 'монгол', 'Mongolian'),
    ('mr', 'मराठी', 'Marathi'),
    ('mrj', 'кырык мары', 'Hill Mari'),
    ('ms', 'Bahasa Melayu', 'Malay'),
    ('my', 'မြန်မာဘာသာ', 'Burmese'),
    ('mzn', 'مازِرونی', 'Mazandarani'),
    ('nah', 'Nāhuatl', 'Nahuatl'),
    ('nap', 'Napulitano', 'Neapolitan'),
    ('nds', 'Plattdüütsch', 'Low Saxon'),
    ('ne', 'नेपाली', 'Nepali'),
    ('new', 'नेपाल भाषा', 'Newar'),
    ('nl', 'Nederlands', 'Dutch'),
    ('nn', 'norsk nynorsk', 'Norwegian (Nynorsk)'),
    ('no', 'norsk bokmål', 'Norwegian (Bokmål)'),
    ('oc', 'occitan', 'Occitan'),
    ('or', 'ଓଡ଼ିଆ', 'Oriya'),
    ('os', 'Ирон', 'Ossetian'),
    ('pa', 'ਪੰਜਾਬੀ', 'Eastern Punjabi'),
    ('pl', 'polski', 'Polish'),
    ('pms', 'Piemontèis', 'Piedmontese'),
    ('pnb', 'پنجابی', 'Western Punjabi'),
    ('pt', 'português', 'Portuguese'),
    ('qu', 'Runa Simi', 'Quechua'),
    ('ro', 'română', 'Romanian'),
    ('ru', 'русский', 'Russian'),
    ('sa', 'संस्कृतम्', 'Sanskrit'),
    ('sah', 'саха тыла', 'Sakha'),
    ('scn', 'sicilianu', 'Sicilian'),
    ('sco', 'Scots', 'Scots'),
    ('sh', 'srpskohrvatski / српскохрватски', 'Serbo-Croatian'),
    ('si', 'සිංහල', 'Sinhalese'),
    ('simple', 'Simple English', 'Simple English'),
    ('sk', 'slovenčina', 'Slovak'),
    ('sl', 'slovenščina', 'Slovenian'),
    ('sq', 'shqip', 'Albanian'),
    ('sr', 'српски / srpski', 'Serbian'),
    ('su', 'Basa Sunda', 'Sundanese'),
    ('sv', 'svenska', 'Swedish'),
    ('sw', 'Kiswahili', 'Swahili'),
    ('ta', 'தமிழ்', 'Tamil'),
    ('te', 'తెలుగు', 'Telugu'),
    ('tg', 'тоҷикӣ', 'Tajik'),
    ('th', 'ไทย', 'Thai'),
    ('tl', 'Tagalog', 'Tagalog'),
    ('tr', 'Türkçe', 'Turkish'),
    ('tt', 'татарча/tatarça', 'Tatar'),
    ('uk', 'українська', 'Ukrainian'),
    ('ur', 'اردو', 'Urdu'),
    ('uz', 'oʻzbekcha/ўзбекча', 'Uzbek'),
    ('vec', 'vèneto', 'Venetian'),
    ('vi', 'Tiếng Việt', 'Vietnamese'),
    ('vo', 'Volapük', 'Volapük'),
    ('wa', 'walon', 'Walloon'),
    ('war', 'Winaray', 'Waray'),
    ('yi', 'ייִדיש', 'Yiddish'),
    ('yo', 'Yorùbá', 'Yoruba'),
    ('zh', '中文', 'Chinese'),
    ('zh-min-nan', 'Bân-lâm-gú', 'Min Nan'),
    ('zh-yue', '粵語', 'Cantonese'),
]


def get_langs() -> list[dict[str, str]]:
    """List of all known languages."""
    return [dict(zip(('code', 'local', 'english'), l)) for l in langs]


def get_current_language() -> str:
    """Return the Wikipedia language code for the current language."""
    return session.get('current_lang', 'en') if has_request_context() else 'en'
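A quick sketch of the two helpers above; outside a Flask request context get_current_language() simply returns "en":

    from add_links.language import get_langs, get_current_language

    assert get_current_language() == "en"
    print(get_langs()[0])   # {'code': 'af', 'local': 'Afrikaans', 'english': 'Afrikaans'}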
add_links/match.py (new file, +381)
@@ -0,0 +1,381 @@
from __future__ import unicode_literals

import re
import typing

from .api import MissingPage, call_get_diff, get_wiki_info
from .core import get_case_from_content, get_content_and_timestamp, get_revision_info
from .util import is_title_case, lc_alpha

re_link_in_text = re.compile(r"\[\[[^]]+?\]\]", re.I | re.S)


class LinkReplace(Exception):
    pass


en_dash = "\u2013"
trans = {",": ",?", " ": " *[-\n]? *"}
trans[en_dash] = trans[" "]

trans2 = {" ": r"('?s?\]\])?'?s? ?(\[\[(?:.+\|)?)?", "-": "[- ]"}
trans2[en_dash] = trans2[" "]

patterns = [
    lambda q: re.compile(
        r"(?<!-)(?:\[\[(?:[^]]+\|)?)?(%s)%s(?:\]\])?"
        % (
            re.escape(q[0]),
            "".join("-?" + (trans2[c] if c in trans2 else re.escape(c)) for c in q[1:]),
        ),
        re.I,
    ),
    lambda q: re.compile(
        r"(?<!-)\[\[[^|]+\|(%s)%s\]\]" % (re.escape(q[0]), re.escape(q[1:])), re.I
    ),
    lambda q: re.compile(
        r"(?<!-)\[\[[^|]+\|(%s)%s(?:\]\])?"
        % (
            re.escape(q[0]),
            "".join("-?" + (trans2[c] if c in trans2 else re.escape(c)) for c in q[1:]),
        ),
        re.I,
    ),
    lambda q: re.compile(r"(?<!-)(%s)%s" % (re.escape(q[0]), re.escape(q[1:])), re.I),
    lambda q: re.compile(
        r"(?<!-)(%s)%s"
        % (
            re.escape(q[0]),
            "".join((trans[c] if c in trans else re.escape(c)) for c in q[1:]),
        ),
        re.I,
    ),
]


class NoMatch(Exception):
    pass


re_cite = re.compile(
    r"<ref( [^>]*?)?>\s*({{cite.*?}}|\[https?://[^]]*?\])\s*</ref>", re.I | re.S
)


def parse_cite(text: str) -> typing.Iterator[tuple[str, str]]:
    """Parse a citation template."""
    prev = 0
    for m in re_cite.finditer(text):
        yield ("text", text[prev : m.start()])
        yield ("cite", m.group(0))
        prev = m.end()
    yield ("text", text[prev:])


re_heading = re.compile(r"^\s*(=+)\s*(.+)\s*\1(<!--.*-->|\s)*$")


def section_iter(text: str) -> typing.Iterator[tuple[str | None, str]]:
    """Iterate sections yielding tuples of heading and section text."""
    cur_section = ""
    heading = None
    in_comment = False
    for line in text.splitlines(True):
        if "<!--" in line:
            in_comment = True
        if "-->" in line:
            in_comment = False
        m = re_heading.match(line)
        if in_comment or not m:
            cur_section += line
            continue
        if cur_section or heading:
            yield (heading, cur_section)
        heading = m.group()
        cur_section = ""
        continue
    yield (heading, cur_section)


def get_subsections(text: str, section_num: int) -> str:
    "retrieve the text of subsections for a given section number within an article"
    found = ""
    collection_level = None
    for num, (heading, body) in enumerate(section_iter(text)):
        if heading is None:
            level = 0
        else:
            m = re_heading.match(heading)
            assert m
            level = len(m.group(1))
        if num == section_num:
            collection_level = level
            continue
        if collection_level:
            if level > collection_level:
                assert heading
                found += heading + body
            else:
                break
    return found


def match_found(m, q, linkto):
    if q[1:] == m.group(0)[1:]:
        replacement = m.group(1) + q[1:]
    elif any(c.isupper() for c in q[1:]) or m.group(0) == m.group(0).upper():
        replacement = q
    elif is_title_case(m.group(0)):
        replacement = None
        replacement = get_case_from_content(q)
        if replacement is None:
            replacement = q.lower()
    else:
        replacement = m.group(1) + q[1:]
    assert replacement
    if linkto:
        if linkto[0].isupper() and replacement[0] == linkto[0].lower():
            linkto = linkto[0].lower() + linkto[1:]
        elif replacement[0].isupper():
            linkto = linkto[0].upper() + linkto[1:]
        replacement = linkto + "|" + replacement
    return replacement


def parse_links(text: str) -> typing.Iterator[tuple[str, str]]:
    prev = 0
    for m in re_link_in_text.finditer(text):
        if prev != m.start():
            yield ("text", text[prev : m.start()])
        if any(
            m.group().lower().startswith("[[" + prefix)
            for prefix in ("file:", "image:")
        ):
            yield ("image", m.group(0))
        else:
            yield ("link", m.group(0))
        prev = m.end()
    if prev < len(text):
        yield ("text", text[prev:])


def mk_link_matcher(q):
    re_links = [p(q) for p in patterns]

    def search_for_link(text):
        for re_link in re_links:
            m = re_link.search(text)
            if m and m.group(0).count("[[") < 4:
                return m

    return search_for_link


def add_link(m, replacement, text):
    return m.re.sub(lambda m: "[[%s]]" % replacement, text, count=1)


def find_link_in_chunk(q, content, linkto=None):
    search_for_link = mk_link_matcher(q)
    new_content = ""
    replacement = None

    match_in_non_link = False
    bad_link_match = False
    found_text_to_link = None

    for token_type, text in parse_links(content):
        if token_type == "text":
            if search_for_link(text):
                match_in_non_link = True
        elif token_type == "image":
            before, sep, link_text = text[:-2].rpartition("|")
            m = search_for_link(link_text)
            if m:
                found_text_to_link = m.group(0)
                replacement = match_found(m, q, linkto)
                text = before + sep + add_link(m, replacement, link_text) + "]]"
        elif token_type == "link" and not replacement and not match_in_non_link:
            link_text = text[2:-2]
            link_dest = None
            if "|" in link_text:
                link_dest, link_text = link_text.split("|", 1)
            m = search_for_link(link_text)
            if m and (not link_dest or not link_dest.startswith("#")):
                lc_alpha_q = lc_alpha(q)

                bad_link_match = (
                    link_dest
                    and len(link_dest) > len(q)
                    and (lc_alpha_q not in lc_alpha(link_dest))
                )
                if not link_dest:
                    if q in link_text and len(link_text) > len(q):
                        bad_link_match = True
                if bad_link_match and link_dest:
                    try:
                        link_dest_redirect = get_wiki_info(link_dest)
                    except MissingPage:
                        link_dest_redirect = None
                    if (
                        link_dest_redirect
                        and lc_alpha(link_dest_redirect) == lc_alpha_q
                    ):
                        bad_link_match = False
                if not bad_link_match:
                    replacement = match_found(m, q, linkto)
                    found_text_to_link = m.group(0)
                    text = add_link(m, replacement, link_text)
        new_content += text
    if not replacement:
        if bad_link_match:
            raise LinkReplace
        m = search_for_link(content)
        if m:
            found_text_to_link = m.group(0)
            replacement = match_found(m, q, linkto)
            new_content = add_link(m, replacement, content)
            if linkto:
                m_end = m.end()
                re_extend = re.compile(m.re.pattern + r"\w*\b", re.I)
                m = re_extend.search(content)
                if m and m.end() > m_end:
                    replacement += content[m_end : m.end()]
                    new_content = add_link(m, replacement, content)
    return (new_content, replacement, found_text_to_link)


def find_link_in_text(q, content):
    (new_content, replacement, _) = find_link_in_chunk(q, content)
    if replacement:
        return (new_content, replacement)
    raise NoMatch


def find_link_in_content(q, content, linkto=None):
    if linkto:
        try:
            return find_link_in_content(linkto, content)
        except NoMatch:
            pass
    replacement = None
    new_content = ""
    link_replace = False
    for header, section_text in section_iter(content):
        if header:
            new_content += header
        for token_type, text in parse_cite(section_text):
            if token_type == "text" and not replacement:
                try:
                    (new_text, replacement, replaced_text) = find_link_in_chunk(
                        q, text, linkto=linkto
                    )
                except LinkReplace:
                    link_replace = True
                if replacement:
                    text = new_text
            new_content += text
    if replacement:
        return (new_content, replacement, replaced_text)
    raise LinkReplace if link_replace else NoMatch


def find_link_and_section(q, content, linkto=None):
    if linkto:
        try:
            return find_link_and_section(linkto, content)
        except NoMatch:
            pass
    sections = list(section_iter(content))
    replacement = None

    search_for_link = mk_link_matcher(q)

    found: dict[str, str | int] = {}

    for section_num, (header, section_text) in enumerate(sections):
        new_content = ""
        if header:
            new_content += header
        for token_type, text in parse_cite(section_text):
            if token_type == "text" and not replacement:
                new_text = ""
                for token_type2, text2 in parse_links(text):
                    if token_type2 == "link" and not replacement:
                        link_text = text2[2:-2]
                        if "|" in link_text:
                            link_dest, link_text = link_text.split("|", 1)
                        else:
                            link_dest = None
                        m = search_for_link(link_text)
                        if m:
                            if link_dest:
                                found["link_dest"] = link_dest
                            found["link_text"] = link_text
                            replacement = match_found(m, q, None)
                            text2 = add_link(m, replacement, link_text)
                    new_text += text2
                if replacement:
                    text = new_text
                else:
                    m = search_for_link(text)
                    if m:
                        replacement = match_found(m, q, linkto)
                        text = add_link(m, replacement, text)
            new_content += text
        if replacement:
            found.update(
                {
                    "section_num": section_num,
                    "section_text": new_content,
                    "old_text": (header or "") + section_text,
                    "replacement": replacement,
                }
            )
            return found
    raise NoMatch


def find_refs(text: str) -> list[str]:
    """Find <ref> in wikitext."""

    refs = re.findall("<ref(?:[^>]*)>(.+?)</ref>", text)
    print(refs)
    return refs


def new_link_is_in_ref(replacement: str, text: str) -> bool:
    """Is the new link in a <ref>."""
    link = f"[[{replacement}]]"
    return any(link in ref for ref in find_refs(text))


def get_match(q: str, title: str, linkto: str | None) -> dict[str, typing.Any]:
    """Get match."""
    rev = get_revision_info(title)

    found: dict[str, typing.Any] = find_link_and_section(q, rev["content"], linkto)

    assert not new_link_is_in_ref(found["replacement"], found["section_text"])

    found["revid"] = rev["revid"]
    found["pageid"] = rev["pageid"]
    found["section_text"] += get_subsections(rev["content"], found["section_num"])

    return found


def get_diff(q: str, title: str, linkto: str | None) -> dict[str, typing.Any]:
    """Get diff."""
    content, timestamp = get_content_and_timestamp(title)
    found: dict[str, typing.Any] = find_link_and_section(q, content, linkto)

    if new_link_is_in_ref(found["replacement"], found["section_text"]):
        raise NoMatch

    section_text = found["section_text"] + get_subsections(
        content, found["section_num"]
    )

    found["diff"] = call_get_diff(title, found["section_num"], section_text)
    return found
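A hedged sketch of the matching flow above: find_link_and_section() locates a section where the phrase can be turned into a wikilink, and get_diff() asks the API to render the change as a diff. The phrase and title below are examples, not values used anywhere in this commit:

    from add_links.match import get_diff, NoMatch

    try:
        found = get_diff("atmospheric circulation", "Hadley cell", linkto=None)
    except NoMatch:
        print("no linkable match in the article")
    else:
        print(found["replacement"], found["section_num"])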
add_links/mediawiki_api.py (new file, +101)
@@ -0,0 +1,101 @@
"""Interface with the mediawiki API."""

import typing
from pprint import pprint
from typing import Any, cast

from . import wikidata_oauth

wiki_hostname = "en.wikipedia.org"
wiki_api_php = f"https://{wiki_hostname}/w/api.php"
user_agent = "add-links/0.1"


def parse_page(enwiki: str) -> dict[str, Any]:
    """Call mediawiki parse API for given article."""
    params: dict[str, str | int] = {
        "action": "parse",
        "format": "json",
        "formatversion": 2,
        "disableeditsection": 1,
        "page": enwiki,
        "prop": "text|links|headhtml",
        "disabletoc": 1,
    }

    parse: dict[str, Any] = call(params)["parse"]
    return parse


def call(params: dict[str, str | int]) -> dict[str, typing.Any]:
    """Make request to the mediawiki API via the OAuth session."""
    data = wikidata_oauth.api_post_request(params)
    return cast(dict[str, Any], data.json())


def article_exists(title: str) -> bool:
    """Check whether an article exists."""
    params: dict[str, str | int] = {
        "action": "query",
        "format": "json",
        "formatversion": 2,
        "titles": title,
    }
    return not call(params)["query"]["pages"][0].get("missing")


def get_content(title: str) -> tuple[str, int]:
    """Get article text."""
    params: dict[str, str | int] = {
        "action": "query",
        "format": "json",
        "formatversion": 2,
        "prop": "revisions|info",
        "rvprop": "content|timestamp|ids",
        "titles": title,
    }
    data = call(params)
    rev = data["query"]["pages"][0]["revisions"][0]
    content: str = rev["content"]
    revid: int = int(rev["revid"])
    return content, revid


def compare(title: str, new_text: str) -> str:
    """Generate a diff for the new article text."""
    params: dict[str, str | int] = {
        "format": "json",
        "formatversion": 2,
        "action": "compare",
        "fromtitle": title,
        "toslots": "main",
        "totext-main": new_text,
        "prop": "diff",
    }
    diff: str = call(params)["compare"]["body"]
    return diff


def edit_page(
    pageid: int, section: str | int, text: str, summary: str, baserevid: str, token: str
) -> str:
    """Edit a page on Wikipedia."""
    params: dict[str, str | int] = {
        "format": "json",
        "formatversion": 2,
        "action": "edit",
        "pageid": pageid,
        "text": text,
        "baserevid": baserevid,
        "token": token,
        "nocreate": 1,
        "summary": summary,
        "section": section,
    }
    ret = call(params)
    if "edit" not in ret:
        print("params")
        pprint(params)
        print()
        pprint(ret)
    return typing.cast(str, ret["edit"])
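The edit path in this module is: get_content() to fetch the current wikitext and base revision, compare() to preview the change, then edit_page() to save it. Every call goes through the OAuth session in wikidata_oauth, so a configured Flask app and a logged-in session are assumed; the CSRF token is obtained elsewhere, and the sketch below only shows the call shape with example arguments:

    from add_links import mediawiki_api

    text, revid = mediawiki_api.get_content("Hadley cell")
    html_diff = mediawiki_api.compare("Hadley cell", text.replace("trade winds", "[[trade winds]]"))
    # mediawiki_api.edit_page(pageid, 0, new_text, "add link", baserevid=revid, token=csrf_token)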
add_links/mediawiki_api_old.py (new file, +48)
@@ -0,0 +1,48 @@
"""Interface with the mediawiki API."""

from typing import Any

import requests

wiki_hostname = "en.wikipedia.org"
wiki_api_php = f"https://{wiki_hostname}/w/api.php"
user_agent = "dab-mechanic/0.1"


def parse_page(enwiki: str) -> dict[str, Any]:
    """Call mediawiki parse API for given article."""
    params: dict[str, str | int] = {
        "action": "parse",
        "format": "json",
        "formatversion": 2,
        "disableeditsection": 1,
        "page": enwiki,
        "prop": "text|links|headhtml",
        "disabletoc": 1,
    }

    parse: dict[str, Any] = get(params)["parse"]
    return parse


def get(params: dict[str, str | int]) -> dict[str, Any]:
    """Make GET request to mediawiki API."""
    data: dict[str, Any] = requests.get(
        wiki_api_php, headers={"User-Agent": user_agent}, params=params
    ).json()
    return data


def get_content(title: str) -> str:
    """Get article text."""
    params: dict[str, str | int] = {
        "action": "query",
        "format": "json",
        "formatversion": 2,
        "prop": "revisions|info",
        "rvprop": "content|timestamp",
        "titles": title,
    }
    data = get(params)
    rev: str = data["query"]["pages"][0]["revisions"][0]["content"]
    return rev
add_links/util.py (new file, +115)
@@ -0,0 +1,115 @@
"""Util functions."""

import re
import urllib.parse
from typing import Any

# util functions that don't access the network

namespaces = {
    ns.casefold()
    for ns in (
        "Special",
        "Media",
        "Talk",
        "Template",
        "Portal",
        "Portal talk",
        "Book",
        "Book talk",
        "Template talk",
        "Draft",
        "Draft talk",
        "Help",
        "Help talk",
        "Category",
        "Category talk",
        "User",
        "Gadget",
        "Gadget talk",
        "Gadget definition",
        "Gadget definition talk",
        "Topic",
        "User talk",
        "Wikipedia",
        "Education Program",
        "Education Program talk",
        "Wikipedia talk",
        "File",
        "File talk",
        "TimedText",
        "TimedText talk",
        "MediaWiki",
        "Module",
        "Module talk",
        "MediaWiki talk",
    )
}

re_space_or_dash = re.compile("[ -]")


def is_title_case(phrase: str) -> bool:
    """Check whether a given phrase is in Title Case."""
    return all(
        term[0].isupper() and term[1:].islower()
        for term in re_space_or_dash.split(phrase)
        if term and term[0].isalpha()
    )


def urlquote(value: str) -> str:
    """Prepare string for use in URL param."""
    return urllib.parse.quote_plus(value.encode("utf-8"))


def strip_parens(q: str) -> str:
    """Remove a word in parentheses from the end of a string."""
    m = re.search(r" \(.*?\)$", q)
    return q[: m.start()] if m else q


def starts_with_namespace(title: str) -> bool:
    """Check if a title starts with a namespace."""
    return ":" in title and title.split(":", 1)[0].casefold() in namespaces


def is_disambig(doc: dict[str, Any]) -> bool:
    """Check whether this is a disambiguation page."""
    return any(
        "disambig" in t
        or t.endswith("dis")
        or "given name" in t
        or t == "template:surname"
        for t in (t["title"].lower() for t in doc.get("templates", []))
    )


def norm(s: str) -> str:
    """Normalise string."""
    s = re.sub(r"\W", "", s).lower()
    return s[:-1] if s and s[-1] == "s" else s


def case_flip(s: str) -> str:
    """Switch case of character."""
    if s.islower():
        return s.upper()
    if s.isupper():
        return s.lower()
    return s


def case_flip_first(s: str) -> str:
    """Switch case of first character in string."""
    return case_flip(s[0]) + s[1:]


def lc_alpha(s: str) -> str:
    """Lower case alphabetic characters in string."""
    return "".join(c.lower() for c in s if c.isalpha())


def wiki_space_norm(s: str) -> str:
    """Normalise article title."""
    return s.replace("_", " ").strip()
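These helpers are pure string functions, so they can be exercised directly; a doctest-style sketch with made-up inputs:

    from add_links.util import is_title_case, norm, lc_alpha, strip_parens

    assert is_title_case("Clean Coal Technology")
    assert norm("Bats!") == "bat"                 # strips non-word chars, lowercases, drops trailing 's'
    assert lc_alpha("Foo-Bar 7") == "foobar"
    assert strip_parens("Mercury (planet)") == "Mercury"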
add_links/wikidata_oauth.py (new file, +98, listing truncated)
@@ -0,0 +1,98 @@
import typing
import urllib
from typing import cast

from flask import current_app, session
from requests_oauthlib import OAuth1Session

wiki_hostname = "en.wikipedia.org"
api_url = f"https://{wiki_hostname}/w/api.php"


def get_edit_proxy() -> dict[str, str]:
    """Retrieve proxy information from config."""
    edit_proxy = current_app.config.get("EDIT_PROXY")
    if edit_proxy:
        return {"http": edit_proxy, "https": edit_proxy}
    else:
        return {}


def api_post_request(params: dict[str, str | int]):
    """HTTP Post using Oauth."""
    app = current_app
    # url = "https://www.wikidata.org/w/api.php"
    client_key = app.config["CLIENT_KEY"]
    client_secret = app.config["CLIENT_SECRET"]
    oauth = OAuth1Session(
        client_key,
        client_secret=client_secret,
        resource_owner_key=session["owner_key"],
        resource_owner_secret=session["owner_secret"],
    )
    proxies = get_edit_proxy()
    return oauth.post(api_url, data=params, timeout=4, proxies=proxies)


def raw_request(params: typing.Mapping[str, str | int]):
    """Low-level API request."""
    app = current_app
    # url = "https://www.wikidata.org/w/api.php?" + urlencode(params)
    client_key = app.config["CLIENT_KEY"]