diff --git a/add_links/api.py b/add_links/api.py index 05fc7b1..6f57def 100644 --- a/add_links/api.py +++ b/add_links/api.py @@ -1,4 +1,5 @@ import re +import sys import typing import requests @@ -72,18 +73,33 @@ webpage_error = ( ) +def _get_active_session() -> requests.sessions.Session: + """Return OAuth session if one is available in Flask context, else plain session.""" + try: + from flask import g + if hasattr(g, "oauth_session") and g.oauth_session is not None: + return g.oauth_session # type: ignore[return-value] + except RuntimeError: + pass + print("WARNING: using unauthenticated session", file=sys.stderr) + return get_session() + + def api_get(params: StrDict) -> StrDict: """Make call to Wikipedia API.""" - s = get_session() + s = _get_active_session() r = s.get(get_query_url(), params=params) try: ret: StrDict = r.json() except JSONDecodeError: + print(f"API request failed: HTTP {r.status_code}", file=sys.stderr) + print(f"Response body: {r.text!r}", file=sys.stderr) if webpage_error in r.text: raise MediawikiError(webpage_error) - else: - raise MediawikiError("unknown error") + if r.status_code == 429: + raise MediawikiError("Wikipedia rate limit exceeded — wait a moment and try again.") + raise MediawikiError(f"HTTP {r.status_code}: {r.text[:200]!r}") check_for_error(ret) return ret @@ -271,7 +287,7 @@ def call_get_diff(title: str, section_num: int, section_text: str) -> str: "rvdifftotext": section_text.strip(), } - s = get_session() + s = _get_active_session() r = s.post(get_query_url(), data=data) try: ret = r.json() diff --git a/add_links/match.py b/add_links/match.py index 9d00404..a3440e7 100644 --- a/add_links/match.py +++ b/add_links/match.py @@ -78,7 +78,7 @@ re_cite = re.compile( re.I | re.S, ) -re_cite_template_start = re.compile(r"\{\{(?:cite|citation|short description|gli|defn|annotated link|excerpt|main|see)\b", re.I) +re_cite_template_start = re.compile(r"\{\{(?:cite|citation|short description|gli|defn|annotated link|excerpt|main|see|for)\b", re.I) re_no_param_template = re.compile(r"\{\{[^|{}]+\}\}") re_external_link = re.compile(r"\[https?://[^\]]+\]") # Italic text (work titles in bibliographies). Handles apostrophes in content @@ -252,6 +252,14 @@ def add_link(m: re.Match[str], replacement: str, text: str) -> str: if matched_text.startswith("[[") and matched_text.endswith("|"): return m.re.sub(lambda m: f"[[{replacement}|", text, count=1) + split_links = matched_text.find("]] [[") + if split_links > 0 and m.start() >= 2 and text[m.start() - 2 : m.start()] == "[[": + # Match starts inside one link and continues into the next opening link. + # Link only the text from the first link span and leave the second link as-is. + link_dest = replacement.split("|")[0] if "|" in replacement else replacement + visible = matched_text[:split_links] + return text[: m.start() - 2] + f"[[{link_dest}|{visible}]]" + text[m.start() + split_links + 2 :] + inner_bracket = matched_text.find("[[") if inner_bracket > 0: prefix = matched_text[:inner_bracket].rstrip() @@ -551,4 +559,6 @@ def get_diff(q: str, title: str, linkto: str | None) -> dict[str, typing.Any]: ) found["diff"] = call_get_diff(title, found["section_num"], section_text) + if not found["diff"]: + raise NoMatch return found diff --git a/add_links/mediawiki_oauth.py b/add_links/mediawiki_oauth.py index 39aecb0..12ebd58 100644 --- a/add_links/mediawiki_oauth.py +++ b/add_links/mediawiki_oauth.py @@ -1,5 +1,6 @@ """Wikipedia OAuth.""" +import sys import typing import urllib from typing import cast @@ -73,9 +74,8 @@ def api_request(params: typing.Mapping[str, str | int]) -> dict[str, typing.Any] try: return cast(dict[str, typing.Any], r.json()) except Exception: - print("text") - print(r.text) - print("---") + print(f"API request failed: HTTP {r.status_code}", file=sys.stderr) + print(f"Response body: {r.text!r}", file=sys.stderr) raise @@ -99,13 +99,40 @@ def userinfo_call() -> typing.Mapping[str, typing.Any]: return api_request(params) +def get_oauth_session() -> OAuth1Session | None: + """Return an OAuth1Session for the current user, or None if not logged in.""" + if "owner_key" not in session or "owner_secret" not in session: + return None + app = current_app + client_key = app.config["CLIENT_KEY"] + client_secret = app.config["CLIENT_SECRET"] + oauth = OAuth1Session( + client_key, + client_secret=client_secret, + resource_owner_key=session["owner_key"], + resource_owner_secret=session["owner_secret"], + ) + oauth.headers.update({"User-Agent": ua}) + oauth.params = typing.cast( + dict[str, str | int], + {"format": "json", "action": "query", "formatversion": 2}, + ) + return oauth + + def get_username() -> None | str: """Get the username or None if not logged in.""" if "owner_key" not in session: return None # not authorized if "username" not in session: - reply = userinfo_call() + try: + reply = userinfo_call() + except Exception as e: + print(f"get_username failed, clearing session: {e}", file=sys.stderr) + session.pop("owner_key", None) + session.pop("owner_secret", None) + return None if "query" not in reply: return None session["username"] = reply["query"]["userinfo"]["name"] diff --git a/static/css/diff.css b/static/css/diff.css index 65d5ef9..c7009f2 100644 --- a/static/css/diff.css +++ b/static/css/diff.css @@ -5,19 +5,16 @@ span.searchmatch { font-weight: bold; } table.diff,td.diff-otitle,td.diff-ntitle{background-color:white} td.diff-otitle,td.diff-ntitle{text-align:center} -td.diff-marker{text-align:right;font-weight:bold;font-size:1.25em} +td.diff-marker{width:1.5em;text-align:center;font-weight:bold;font-size:1.25em;padding:0 0.3em} td.diff-lineno{font-weight:bold} td.diff-addedline,td.diff-deletedline,td.diff-context{font-size:88%;vertical-align:top;white-space:-moz-pre-wrap;white-space:pre-wrap} -td.diff-addedline,td.diff-deletedline{border-style:solid;border-width:1px 1px 1px 4px;border-radius:0.33em} -td.diff-addedline{border-color:#a3d3ff} -td.diff-deletedline{border-color:#ffe49c} -td.diff-context{background:#f3f3f3;color:#333333;border-style:solid;border-width:1px 1px 1px 4px;border-color:#e6e6e6;border-radius:0.33em} +td.diff-addedline,td.diff-deletedline{border-left:3px solid} +td.diff-addedline{border-color:#a3d3ff;background:#f0f8ff} +td.diff-deletedline{border-color:#ffe49c;background:#fffaf0} +td.diff-context{color:#555} .diffchange{font-weight:bold;text-decoration:none} -table.diff{border:none;width:98%;border-spacing:4px; table-layout:fixed} -td.diff-addedline .diffchange,td.diff-deletedline .diffchange{border-radius:0.33em;padding:0.25em 0} +table.diff{border:none;width:100%;border-spacing:0;border-collapse:collapse;table-layout:auto} td.diff-addedline .diffchange{background:#d8ecff} td.diff-deletedline .diffchange{background:#feeec8} -table.diff td{padding:0.33em 0.66em} -table.diff col.diff-marker{width:2%} -table.diff col.diff-content{width:48%} -table.diff td div{ word-wrap:break-word; overflow:auto} +table.diff td{padding:0.2em 0.5em} +table.diff td div{word-wrap:break-word;overflow:auto} diff --git a/static/favicon.svg b/static/favicon.svg new file mode 100644 index 0000000..181c415 --- /dev/null +++ b/static/favicon.svg @@ -0,0 +1,3 @@ + + 🔗 + diff --git a/templates/all_done.html b/templates/all_done.html index c346482..e3135f7 100644 --- a/templates/all_done.html +++ b/templates/all_done.html @@ -1,10 +1,11 @@ {% extends "base.html" %} -{% block title %}Index{% endblock %} +{% block title %}All done{% endblock %} {% block content %} -
-

All done

-
back to index
-
+
+

All done

+

No more candidates found for this article.

+ Search another article +
{% endblock %} diff --git a/templates/article.html b/templates/article.html index 55d05b0..a96770c 100644 --- a/templates/article.html +++ b/templates/article.html @@ -1,48 +1,152 @@ {% extends "base.html" %} -{% block title %}Link '{{ title }}' in '{{ hit_title }}'{% endblock %} +{% block title %}{{ title }}{% endblock %} {% block style %} {% endblock %} {% block content %} -
-

Link '{{ title }}' in '{{ hit_title }}'

-
- - -
+
+ -
Username: {{ g.user }}
+
+

Find links to "{{ title }}"

+ {{ title }} ↗ + {% if redirect_to %} + → redirects to {{ redirect_to }} ↗ + {% endif %} +
-
view article
+
+ {{ total }} mentions total + {{ with_link }} already linked{% if total > 0 %} ({{ "{:.0%}".format(with_link / total) }}){% endif %} + {% if saves_this_session %} + {{ saves_this_session }} added this session + {% endif %} +
-
back to index
+
+
+
+ Searching… +
+ Searching… +
+
-
total: {{ total }}
-
with link: {{ with_link }}
-
ratio: {{ "{:.1%}".format(with_link / total) }}
- {#
hit: {{ hit }}
#} -
replacement: {{ found.replacement }}
-
section: {{ found.section }}
- - {{ diff | safe }} -
-
- -
- - skip + -
    + + + {% if hits %} +
    + {{ hits | length }} candidates +
      {% for hit in hits %} - {% set url = url_for("article_page", url_title=url_title, title=hit.title) %} -
    1. {{ hit.title }} – {{ hit.snippet | safe }}
    2. +
    3. + {{ hit.title }} +
    4. {% endfor %}
    -
+ + {% endif %} +
{% endblock %} +{% block script %} + +{% endblock %} diff --git a/templates/base.html b/templates/base.html index 3804a16..0328311 100644 --- a/templates/base.html +++ b/templates/base.html @@ -2,21 +2,38 @@ - + - - - {% block title %}{% endblock %} - - + + {% block title %}{% endblock %} – Missing Link {% block style %}{% endblock %} + + {% block content %}{% endblock %} - - + {% block script %}{% endblock %} diff --git a/templates/error.html b/templates/error.html new file mode 100644 index 0000000..c1a5018 --- /dev/null +++ b/templates/error.html @@ -0,0 +1,17 @@ +{% extends "base.html" %} + +{% block title %}Error{% endblock %} + +{% block content %} +
+
+
+
+

Something went wrong

+

{{ message }}

+
+ Back to home +
+
+
+{% endblock %} diff --git a/templates/index.html b/templates/index.html index feab172..11c14ec 100644 --- a/templates/index.html +++ b/templates/index.html @@ -1,25 +1,44 @@ {% extends "base.html" %} -{% block title %}Index{% endblock %} +{% block title %}Missing Link{% endblock %} {% block content %} -
-

Index

-
- - -
- -
Username: {{ g.user }}
- - - {% for item in examples %} - - - - - - {% endfor %} -
{{ item.title }}{{ item.total }}{{ "{:.1%}".format(item.with_links / item.total) }}
+
+
+
+

Missing Link

+

Find unlinked mentions of a Wikipedia article and add the links.

+
+ + +
+
+ + {% if debug %} +
+
+

Examples

+ + + + + + + + + + {% for item in examples %} + + + + + + {% endfor %} + +
ArticleTotal% linked
{{ item.title }}{{ item.total }}{{ "{:.0%}".format(item.with_links / item.total) }}
+
+
+ {% endif %} +
{% endblock %} diff --git a/templates/save_done.html b/templates/save_done.html index c96cc66..ffca063 100644 --- a/templates/save_done.html +++ b/templates/save_done.html @@ -1,10 +1,11 @@ {% extends "base.html" %} -{% block title %}Index{% endblock %} +{% block title %}Edit saved{% endblock %} {% block content %} -
-

Save done

-
Save is complete.
-
+
+

Edit saved

+

Your edit has been saved to Wikipedia.

+ Search another article +
{% endblock %} diff --git a/web_view.py b/web_view.py index 32fb1da..6cc669c 100755 --- a/web_view.py +++ b/web_view.py @@ -4,11 +4,13 @@ import html import itertools import json import re +import sys import typing import flask import werkzeug from requests_oauthlib import OAuth1Session +from requests_oauthlib.oauth1_session import TokenRequestDenied from werkzeug.wrappers.response import Response from add_links import api, core, mediawiki_api, mediawiki_oauth @@ -94,14 +96,20 @@ def search_count(q: str) -> int: return get_hit_count(article_title_to_search_query(q)) - 1 -def search_count_with_link(q: str) -> int: +def search_count_with_link(q: str, redirect_to: str | None = None) -> int: """Articles in Wikipedia that include this search term and a link.""" - return get_hit_count(article_title_to_search_query(q) + f' linksto:"{q}"') + count = get_hit_count(article_title_to_search_query(q) + f' linksto:"{q}"') + if redirect_to: + count += get_hit_count(article_title_to_search_query(q) + f' linksto:"{redirect_to}"') + return count -def search_no_link(q: str) -> tuple[int, list[Hit]]: +def search_no_link(q: str, redirect_to: str | None = None) -> tuple[int, list[Hit]]: """Search for mentions of article title with no link included.""" - query = run_search(article_title_to_search_query(q) + f' -linksto:"{q}"', "max") + exclude = f' -linksto:"{q}"' + if redirect_to: + exclude += f' -linksto:"{redirect_to}"' + query = run_search(article_title_to_search_query(q) + exclude, "max") return (query["searchinfo"]["totalhits"], query["search"]) @@ -109,6 +117,7 @@ def search_no_link(q: str) -> tuple[int, list[Hit]]: def global_user() -> None: """Make username available everywhere.""" flask.g.user = mediawiki_oauth.get_username() + flask.g.oauth_session = mediawiki_oauth.get_oauth_session() @app.route("/") @@ -118,17 +127,20 @@ def index() -> str | Response: url = flask.url_for("oauth_callback", **flask.request.args) # type: ignore return flask.redirect(url) - examples = load_examples() - examples.sort( - key=lambda i: float(i["with_links"]) / float(i["total"]), reverse=True - ) - if q := flask.request.args.get("q"): if q_trimmed := q.strip(): return flask.redirect(article_url(q_trimmed)) + debug = flask.request.args.get("debug") + examples: list[dict[str, str | int]] = [] + if debug: + examples = load_examples() + examples.sort( + key=lambda i: float(i["with_links"]) / float(i["total"]), reverse=True + ) + return flask.render_template( - "index.html", examples=examples, article_url=article_url + "index.html", examples=examples, article_url=article_url, debug=debug ) @@ -187,7 +199,12 @@ def start_oauth() -> Response: oauth = OAuth1Session(client_key, client_secret=client_secret, callback_uri="oob") oauth.headers.update({"User-Agent": api.ua}) - fetch_response = oauth.fetch_request_token(request_token_url) + try: + fetch_response = oauth.fetch_request_token(request_token_url) + except TokenRequestDenied as e: + return flask.make_response( + flask.render_template("error.html", message=str(e)), 502 + ) flask.session["owner_key"] = fetch_response.get("oauth_token") flask.session["owner_secret"] = fetch_response.get("oauth_token_secret") @@ -229,7 +246,8 @@ def oauth_callback() -> werkzeug.wrappers.response.Response: flask.session["owner_key"] = oauth_tokens.get("oauth_token") flask.session["owner_secret"] = oauth_tokens.get("oauth_token_secret") - print("login successful") + username = mediawiki_oauth.get_username() + print(f"login successful: {username}", file=sys.stderr) next_page = flask.session.get("after_login") return flask.redirect(next_page if next_page else flask.url_for("index")) @@ -281,31 +299,16 @@ def match_type(q: str, snippet: str) -> str | None: return match -class NoGoodHit(Exception): - """No good hit.""" -def get_best_hit(title: str, hits: list[Hit]) -> tuple[Hit, dict[str, typing.Any]]: - """Find the best hit within the search results.""" - for hit in hits: - if hit["title"].lower() == title.lower(): - continue - # if match_type(title, hit["snippet"]) != "exact": - # continue - - try: - print(f'get diff: {hit["title"]}, {title}') - found = get_diff(title, hit["title"], None) - except NoMatch: - print("no match") - continue - except api.MediawikiError as e: - print(f"MediawikiError for {hit['title']!r}: {e}") - continue - - return (hit, found) - - raise NoGoodHit +def _record_skip(from_title: str, hit_title: str) -> None: + """Record that a candidate was skipped or saved for this article.""" + skipped: dict[str, list[str]] = flask.session.get("skipped", {}) + article_skipped = skipped.get(from_title, []) + if hit_title not in article_skipped: + skipped[from_title] = article_skipped + [hit_title] + flask.session["skipped"] = skipped + flask.session.modified = True def handle_post(url_title: str) -> Response: @@ -316,8 +319,16 @@ def handle_post(url_title: str) -> Response: do_save(from_title, hit_title) except mediawiki_oauth.LoginNeeded: return flask.redirect(flask.url_for("start_oauth")) - except mediawiki_api.APIError as e: - return flask.make_response(f"Save failed: {e}", 502) + except (mediawiki_api.APIError, api.MediawikiError) as e: + return flask.make_response( + flask.render_template("error.html", message=f"Save failed: {e}"), 502 + ) + flask.session["saves"] = flask.session.get("saves", 0) + 1 + saves_by_title: dict[str, int] = flask.session.get("saves_by_title", {}) + saves_by_title[from_title] = saves_by_title.get(from_title, 0) + 1 + flask.session["saves_by_title"] = saves_by_title + flask.session.modified = True + _record_skip(from_title, hit_title) return flask.redirect( flask.url_for("article_page", url_title=url_title, after=hit_title) ) @@ -330,48 +341,55 @@ def article_page(url_title: str) -> str | Response: return handle_post(url_title) from_title = url_title.replace("_", " ").strip() - article_title = flask.request.args.get("title") - total = search_count(from_title) - with_link = search_count_with_link(from_title) + try: + redirect_to = api.get_wiki_info(from_title) + except (api.MissingPage, api.MultipleRedirects, api.MediawikiError): + redirect_to = None - no_link_count, hits = search_no_link(from_title) + try: + total = search_count(from_title) + with_link = search_count_with_link(from_title, redirect_to) + _no_link_count, hits = search_no_link(from_title, redirect_to) + except api.MediawikiError as e: + return flask.make_response( + flask.render_template("error.html", message=str(e)), 502 + ) - by_title = {hit["title"]: hit for hit in hits} + # Filter out candidates already processed this session + session_skipped: set[str] = set( + flask.session.get("skipped", {}).get(from_title, []) + ) - found = None - if article_title in by_title: - hit = by_title[article_title] - try: - found = get_diff(from_title, hit["title"], None) - except NoMatch: - pass + # If a specific candidate was requested, move it to the front + title_param = flask.request.args.get("title") + if title_param: + hits = [h for h in hits if h["title"] == title_param] + \ + [h for h in hits if h["title"] != title_param] - if not found: - after = flask.request.args.get("after") - if after: - print(after) - hits_iter = itertools.dropwhile(lambda hit: hit["title"] != after, hits) - skip = next(hits_iter, None) - if skip: - hits = list(hits_iter) + # Record and apply explicit skip-past + after = flask.request.args.get("after") + if after: + _record_skip(from_title, after) + session_skipped.add(after) - try: - hit, found = get_best_hit(from_title, hits) - except NoGoodHit: - return flask.render_template("all_done.html") + hits = [h for h in hits if h["title"] not in session_skipped + and h["title"] != from_title and h["title"] != case_flip_first(from_title)] + + if not hits: + return flask.render_template("all_done.html") + + saves_this_session = flask.session.get("saves_by_title", {}).get(from_title, 0) return flask.render_template( "article.html", title=from_title, + redirect_to=redirect_to, total=total, with_link=with_link, - hit_title=hit["title"], hits=hits, - replacement=found["replacement"], - diff=found["diff"], - found=found, url_title=url_title, + saves_this_session=saves_this_session, ) @@ -379,7 +397,12 @@ def do_save(title: str, hit_title: str) -> str: """Update page on Wikipedia.""" token = mediawiki_oauth.get_token() - found = get_match(title, hit_title, None) + try: + redirect_to = api.get_wiki_info(title) + except (api.MissingPage, api.MultipleRedirects, api.MediawikiError): + redirect_to = None + + found = get_match(title, hit_title, redirect_to) summary = ( f"link [[{found['replacement']}]] using [[:en:User:Edward/Find link|Find link]]" @@ -417,16 +440,20 @@ def api_hits() -> werkzeug.wrappers.response.Response: @app.route("/api/1/valid_hit") def api_valid_hit() -> werkzeug.wrappers.response.Response: - """Return candidates for the given article title.""" - link_from = flask.request.args["link_from"] + """Check if a candidate article has a valid unlinked mention.""" link_to = flask.request.args["link_to"] + link_from = flask.request.args["link_from"] + redirect_to = flask.request.args.get("redirect_to") or None try: - diff, replacement = get_diff(link_to, link_from, None) + found = get_diff(link_to, link_from, redirect_to) except NoMatch: + _record_skip(link_to, link_from) return flask.jsonify(valid=False) + except api.MediawikiError as e: + return flask.jsonify(valid=False, error=str(e)) - return flask.jsonify(valid=True, diff=diff, replacement=replacement) + return flask.jsonify(valid=True, diff=found["diff"], replacement=found["replacement"]) @app.route("/favicon.ico")