429 lines
13 KiB
Python
Executable file
429 lines
13 KiB
Python
Executable file
#!/usr/bin/python3
|
|
|
|
import html
|
|
import itertools
|
|
import json
|
|
import re
|
|
import typing
|
|
|
|
import flask
|
|
import werkzeug
|
|
from requests_oauthlib import OAuth1Session
|
|
from werkzeug.wrappers.response import Response
|
|
|
|
from add_links import api, core, mediawiki_api, mediawiki_oauth
|
|
from add_links.match import NoMatch, get_diff, get_match
|
|
|
|
# Flask application object; its settings are read from the "config.default"
# object.
app = flask.Flask(__name__)
app.config.from_object("config.default")
# NOTE(review): debug mode is switched on unconditionally, after the config
# has been loaded — confirm this is intended outside local development.
app.debug = True

# Host name of the wiki this tool works against, and the api.php / index.php
# URLs derived from it.
wiki_hostname = "en.wikipedia.org"
wiki_api_php = f"https://{wiki_hostname}/w/api.php"
wiki_index_php = f"https://{wiki_hostname}/w/index.php"
|
|
|
|
|
|
class Hit(typing.TypedDict):
    """Candidate article: one entry of the ``search`` list of a search result."""

    # NOTE(review): apart from ``title`` and ``snippet`` the field meanings are
    # inferred from their names only — confirm against the search API.
    ns: int  # presumably the namespace number of the page
    title: str  # page title; compared with the target title when picking a hit
    pageid: int
    size: int
    wordcount: int
    snippet: str  # text excerpt; may carry <span class="searchmatch"> markup
    timestamp: str
|
|
|
|
|
|
def load_examples() -> list[dict[str, str | int]]:
|
|
"""Load examples."""
|
|
return [json.loads(line) for line in open("examples")]
|
|
|
|
|
|
def article_title_to_search_query(title: str) -> str:
    """
    Turn a Wikipedia article title into a quoted search query.

    A title that ends in a parenthesised disambiguation, such as
    "Python (programming language)", is split into the main title and the
    disambiguation text. Both parts are wrapped in double quotes and joined
    with 'AND'. Any other title is simply wrapped in double quotes.

    Args:
        title (str): The Wikipedia article title, possibly ending with
                     disambiguation text in parentheses.

    Returns:
        str: '"[main title]" AND "[disambiguation text]"' when the title has
             a trailing disambiguation, otherwise '"[title]"'.

    Example:
        >>> article_title_to_search_query("Python (programming language)")
        '"Python" AND "programming language"'
        >>> article_title_to_search_query("London")
        '"London"'
    """
    parsed = re.match(r"^(.*) \((.*)\)$", title)
    if parsed is None:
        return f'"{title}"'
    main_title, disambiguation = parsed.groups()
    return f'"{main_title}" AND "{disambiguation}"'
|
|
|
|
|
|
def run_search(q: str, limit: int | str = "max") -> dict[str, typing.Any]:
    """Run a text search on Wikipedia.

    Args:
        q: Search expression, sent as ``srsearch``.
        limit: Value sent as ``srlimit``; defaults to ``"max"``.

    Returns:
        The ``query`` member of the API response.
    """
    search_params = {
        "list": "search",
        "srwhat": "text",
        "srlimit": limit,
        "srsearch": q,
    }
    response = api.api_get(search_params)
    return typing.cast(dict[str, typing.Any], response["query"])
|
|
|
|
|
|
def article_url(title: str) -> str:
    """Build the URL of the ``article_page`` view for *title*.

    Spaces in the title are turned into underscores before it is used as the
    ``url_title`` value.
    """
    url_title = "_".join(title.split(" "))
    return flask.url_for("article_page", url_title=url_title)
|
|
|
|
|
|
def get_hit_count(q: str) -> int:
    """Return the total hit count Wikipedia reports for query *q*."""
    # A limit of 0 is passed: only the hit total is read from the response.
    search_info = run_search(q, limit=0)["searchinfo"]
    return typing.cast(int, search_info["totalhits"])
|
|
|
|
|
|
def search_count(q: str) -> int:
    """Count how often the article title *q* appears in Wikipedia.

    One is subtracted from the raw hit count (presumably to leave out the
    article itself — confirm).
    """
    title_query = article_title_to_search_query(q)
    return get_hit_count(title_query) - 1
|
|
|
|
|
|
def search_count_with_link(q: str) -> int:
    """Count articles that mention the title *q* and also link to it."""
    title_query = article_title_to_search_query(q)
    linked_query = title_query + f' linksto:"{q}"'
    return get_hit_count(linked_query)
|
|
|
|
|
|
def search_no_link(q: str) -> tuple[int, list[Hit]]:
    """Find articles that mention the title *q* without linking to it.

    Returns:
        A ``(total hit count, hits)`` pair taken from the search result.
    """
    search_expr = article_title_to_search_query(q) + f' -linksto:"{q}"'
    result = run_search(search_expr, "max")
    total_hits = result["searchinfo"]["totalhits"]
    return (total_hits, result["search"])
|
|
|
|
|
|
@app.before_request
def global_user() -> None:
    """Store the username on ``flask.g.user`` before every request."""
    username = mediawiki_oauth.get_username()
    flask.g.user = username
|
|
|
|
|
|
@app.route("/")
def index() -> str | Response:
    """Index page.

    Handles three cases, in order:

    * OAuth parameters present (``oauth_verifier`` and ``oauth_token``):
      redirect to the ``oauth_callback`` view, forwarding all query arguments.
    * Non-blank ``q`` argument: redirect to the article page for that title.
    * Otherwise: render ``index.html`` with the examples, sorted by the share
      of mentions that already carry a link (highest first).
    """
    if "oauth_verifier" in flask.request.args and "oauth_token" in flask.request.args:
        url = flask.url_for("oauth_callback", **flask.request.args)  # type: ignore
        return flask.redirect(url)

    # Handle the search box before touching the examples: a redirect never
    # uses them, so reading and sorting the file first is wasted work.
    if q := flask.request.args.get("q"):
        if q_trimmed := q.strip():
            return flask.redirect(article_url(q_trimmed))

    def linked_ratio(example: dict[str, str | int]) -> float:
        """Share of mentions with a link; 0.0 when there are no mentions."""
        total = float(example["total"])
        # Guard the division so one example with a zero total cannot take
        # the whole index page down with a ZeroDivisionError.
        return float(example["with_links"]) / total if total else 0.0

    examples = load_examples()
    examples.sort(key=linked_ratio, reverse=True)

    return flask.render_template(
        "index.html", examples=examples, article_url=article_url
    )
|
|
|
|
|
|
def case_flip(s: str) -> str:
    """
    Flip the case of a single character.

    A lowercase character becomes uppercase and an uppercase character
    becomes lowercase. Anything that is neither is returned untouched.

    Args:
        s (str): A single character string.

    Returns:
        str: The character with its case swapped, or the original character
             if it is not a letter.

    Example:
        >>> case_flip('a')
        'A'
        >>> case_flip('A')
        'a'
        >>> case_flip('1')
        '1'
    """
    if s.islower():
        flipped = s.upper()
    elif s.isupper():
        flipped = s.lower()
    else:
        flipped = s
    return flipped
|
|
|
|
|
|
def case_flip_first(s: str) -> str:
    """Switch the case of the first character in a string.

    Args:
        s: Any string.

    Returns:
        *s* with the case of its first character flipped by ``case_flip``.
        An empty string is returned unchanged.
    """
    # Indexing s[0] on an empty string raises IndexError, so handle it first.
    if not s:
        return s
    return case_flip(s[0]) + s[1:]
|
|
|
|
|
|
def tidy_snippet(snippet: str) -> str:
    """Strip search-match markup from a snippet and decode HTML entities.

    En dashes are turned into hyphens and the ``searchmatch`` span tags are
    removed; the remaining text is then HTML-unescaped.
    """
    substitutions = (
        ("\u2013", "-"),
        ("</span>", ""),
        ('<span class="searchmatch">', ""),
    )
    for old, new in substitutions:
        snippet = snippet.replace(old, new)
    return html.unescape(snippet)
|
|
|
|
|
|
@app.route("/oauth/start")
def start_oauth() -> Response:
    """Start the OAuth login.

    Remembers the optional ``next`` argument in the session as
    ``after_login``, fetches a request token, stores its key and secret in
    the session, and redirects the browser to the wiki's authorization page.
    """
    if next_page := flask.request.args.get("next"):
        flask.session["after_login"] = next_page

    consumer_key = app.config["CLIENT_KEY"]
    consumer_secret = app.config["CLIENT_SECRET"]

    oauth = OAuth1Session(
        consumer_key, client_secret=consumer_secret, callback_uri="oob"
    )
    request_token = oauth.fetch_request_token(
        wiki_index_php + "?title=Special%3aOAuth%2finitiate"
    )

    flask.session["owner_key"] = request_token.get("oauth_token")
    flask.session["owner_secret"] = request_token.get("oauth_token_secret")

    # Both parts of the request token must have come back.
    assert flask.session["owner_key"] and flask.session["owner_secret"]

    authorize_endpoint = f"https://{wiki_hostname}/wiki/Special:OAuth/authorize"
    return flask.redirect(
        oauth.authorization_url(authorize_endpoint, oauth_consumer_key=consumer_key)
    )
|
|
|
|
|
|
@app.route("/oauth/callback", methods=["GET"])
def oauth_callback() -> werkzeug.wrappers.response.Response:
    """OAuth callback.

    Reads the verifier from the callback URL, exchanges the request token
    held in the session for an access token, stores the new key and secret
    in the session, and redirects to the ``after_login`` page or, if none
    was stored, to the index page.
    """
    consumer_key = app.config["CLIENT_KEY"]
    consumer_secret = app.config["CLIENT_SECRET"]

    # First session: only used to parse the verifier out of the callback URL.
    parsing_session = OAuth1Session(
        consumer_key,
        client_secret=consumer_secret,
        resource_owner_key=flask.session.get("owner_key"),
        resource_owner_secret=flask.session.get("owner_secret"),
    )
    callback_params = parsing_session.parse_authorization_response(flask.request.url)
    verifier = callback_params.get("oauth_verifier")

    # Second session: carries the verifier and fetches the access token.
    token_session = OAuth1Session(
        consumer_key,
        client_secret=consumer_secret,
        resource_owner_key=flask.session["owner_key"],
        resource_owner_secret=flask.session["owner_secret"],
        verifier=verifier,
    )
    access_tokens = token_session.fetch_access_token(
        wiki_index_php + "?title=Special%3aOAuth%2ftoken"
    )

    flask.session["owner_key"] = access_tokens.get("oauth_token")
    flask.session["owner_secret"] = access_tokens.get("oauth_token_secret")

    print("login successful")

    destination = flask.session.get("after_login") or flask.url_for("index")
    return flask.redirect(destination)
|
|
|
|
|
|
@app.route("/oauth/disconnect")
def oauth_disconnect() -> werkzeug.wrappers.response.Response:
    """Disconnect OAuth.

    Removes the OAuth-related keys from the session, where present, and
    redirects to the index page.
    """
    oauth_keys = ("owner_key", "owner_secret", "username", "after_login")
    stored_keys = [key for key in oauth_keys if key in flask.session]
    for key in stored_keys:
        del flask.session[key]
    return flask.redirect(flask.url_for("index"))
|
|
|
|
|
|
def match_type(q: str, snippet: str) -> str | None:
    """Discover match type: 'exact', 'case_mismatch' or None.

    The snippet is tidied first and en dashes in *q* are treated as hyphens.
    A match is 'exact' when *q*, or *q* with the case of its first character
    flipped, occurs in the snippet. For a *q* ending in "y" the same test is
    also tried without the final "y". A match that only succeeds when case
    is ignored is a 'case_mismatch'.

    >>> match_type('foo', 'foo')
    'exact'
    >>> match_type('foo', 'bar') is None
    True
    >>> match_type('bar', 'foo bar baz')
    'exact'
    >>> match_type('clean coal technology', 'foo clean coal technologies baz')
    'exact'
    >>> match_type('bar', 'foo Bar baz')
    'exact'
    >>> match_type('bar', 'foo BAR baz')
    'case_mismatch'
    >>> match_type('foo-bar', 'aa foo-bar cc')
    'exact'
    >>> match_type(u'foo\u2013bar', 'aa foo-bar cc')
    'exact'
    """
    needle = q.replace("\u2013", "-")
    text = tidy_snippet(snippet)

    if needle in text or case_flip_first(needle) in text:
        return "exact"

    text_lower = text.lower()
    result = "case_mismatch" if needle.lower() in text_lower else None

    if needle.endswith("y"):
        # Drop the final "y" so that e.g. "technology" matches "technologies".
        stem = needle[:-1]
        if stem in text or case_flip_first(stem) in text:
            return "exact"
        if result is None and stem.lower() in text_lower:
            result = "case_mismatch"

    return result
|
|
|
|
|
|
class NoGoodHit(Exception):
    """Raised when no search hit is a usable candidate."""
|
|
|
|
|
|
def get_best_hit(title: str, hits: list[Hit]) -> tuple[Hit, dict[str, typing.Any]]:
    """Pick the first search hit for which a diff can be produced.

    Hits whose title equals *title* (ignoring case) and hits whose snippet is
    not an 'exact' match are skipped, as are hits for which ``get_diff``
    raises ``NoMatch``.

    Returns:
        The chosen hit together with the value returned by ``get_diff``.

    Raises:
        NoGoodHit: If no hit qualifies.
    """
    for candidate in hits:
        candidate_title = candidate["title"]
        is_same_article = candidate_title.lower() == title.lower()
        if is_same_article or match_type(title, candidate["snippet"]) != "exact":
            continue

        print(f'get diff: {candidate_title}, {title}')
        try:
            found = get_diff(title, candidate_title, None)
        except NoMatch:
            print("no match")
            continue

        return (candidate, found)

    raise NoGoodHit
|
|
|
|
|
|
@app.route("/link/<path:url_title>", methods=["GET", "POST"])
def article_page(url_title: str) -> str | Response:
    """Article page.

    POST: save the link for the submitted ``hit`` and redirect back to this
    page with ``after`` set to that hit, or to the OAuth start page when a
    login is needed.

    GET: search for mentions of the article without a link and show a
    proposed edit. The hit named by the ``title`` argument is used when a
    diff can be produced for it; otherwise the best remaining hit is chosen,
    starting after the hit named by ``after`` if given. Renders
    ``all_done.html`` when no hit is usable.
    """
    from_title = url_title.replace("_", " ").strip()

    if flask.request.method == "POST":
        hit_title = flask.request.form["hit"]
        try:
            do_save(from_title, hit_title)
        except mediawiki_oauth.LoginNeeded:
            return flask.redirect(flask.url_for("start_oauth"))
        next_url = flask.url_for("article_page", url_title=url_title, after=hit_title)
        return flask.redirect(next_url)

    requested_title = flask.request.args.get("title")

    total = search_count(from_title)
    with_link = search_count_with_link(from_title)
    _no_link_count, hits = search_no_link(from_title)

    hits_by_title = {candidate["title"]: candidate for candidate in hits}

    found = None
    if requested_title in hits_by_title:
        hit = hits_by_title[requested_title]
        try:
            found = get_diff(from_title, hit["title"], None)
        except NoMatch:
            # Fall through and look for another hit below.
            pass

    if not found:
        if after := flask.request.args.get("after"):
            print(after)
            remaining = itertools.dropwhile(
                lambda candidate: candidate["title"] != after, hits
            )
            # Consume the ``after`` hit itself; keep only what follows it.
            if next(remaining, None):
                hits = list(remaining)

        try:
            hit, found = get_best_hit(from_title, hits)
        except NoGoodHit:
            return flask.render_template("all_done.html")

    template_context = {
        "title": from_title,
        "total": total,
        "with_link": with_link,
        "hit_title": hit["title"],
        "hits": hits,
        "replacement": found["replacement"],
        "diff": found["diff"],
        "found": found,
        "url_title": url_title,
    }
    return flask.render_template("article.html", **template_context)
|
|
|
|
|
|
def do_save(title: str, hit_title: str) -> str:
    """Save the new link to Wikipedia.

    Fetches a token, computes the match for *title* within *hit_title*, and
    edits the matching section with an edit summary naming the replacement.

    Returns:
        The value returned by ``mediawiki_api.edit_page``.
    """
    # The token is fetched first, before any matching work is done.
    token = mediawiki_oauth.get_token()

    found = get_match(title, hit_title, None)

    replacement = found["replacement"]
    summary = f"link [[{replacement}]] using [[:en:User:Edward/Find link|Find link]]"

    return mediawiki_api.edit_page(
        pageid=found["pageid"],
        section=found["section_num"],
        text=found["section_text"],
        summary=summary,
        baserevid=found["revid"],
        token=token,
    )
|
|
|
|
|
|
@app.route("/saved")
def save_done() -> str:
    """Render the ``save_done.html`` page."""
    page = flask.render_template("save_done.html")
    return page
|
|
|
|
|
|
@app.route("/api/1/hits")
def api_hits() -> werkzeug.wrappers.response.Response:
    """Return candidates for the given article title.

    Query arguments:
        title: The article title to search for (required).

    Returns:
        JSON with the ``title`` and the ``hits`` taken from the ``results``
        of ``core.do_search``. Responds with 400 when ``title`` is missing
        or empty.
    """
    title = flask.request.args.get("title")
    if not title:
        # Explicit check instead of ``assert``: asserts are stripped under
        # ``python -O`` and would report a client mistake as a server error.
        flask.abort(400, description="missing required parameter: title")
    ret = core.do_search(title)
    return flask.jsonify(title=title, hits=ret["results"])

    # mock_hits: list[Hit] = json.load(open("sample.json"))
    # return flask.jsonify(title=title, hits=mock_hits)
|
|
|
|
|
|
@app.route("/api/1/valid_hit")
def api_valid_hit() -> werkzeug.wrappers.response.Response:
    """Report whether a link can be added from one article to another.

    Query arguments:
        link_from: Title of the article that would receive the link.
        link_to: Title of the article to link to.

    Returns:
        JSON ``{"valid": false}`` when ``get_diff`` raises ``NoMatch``,
        otherwise ``valid`` set to true plus the ``diff`` and
        ``replacement`` values.
    """
    link_from = flask.request.args["link_from"]
    link_to = flask.request.args["link_to"]

    try:
        found = get_diff(link_to, link_from, None)
    except NoMatch:
        return flask.jsonify(valid=False)

    # The other callers in this file read the result of get_diff by key, so
    # do the same here: unpacking a mapping as a tuple would yield its keys.
    return flask.jsonify(
        valid=True, diff=found["diff"], replacement=found["replacement"]
    )
|
|
|
|
|
|
@app.route("/favicon.ico")
def favicon() -> Response:
    """Answer favicon requests with an empty 404 response."""
    not_found = flask.Response(status=404)
    return not_found
|
|
|
|
|
|
# Script entry point: serve the app on port 8000.
# NOTE(review): this binds to all interfaces while app.debug is set to True
# near the top of the file — confirm it is only used for local development.
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=8000)
|