diff --git a/dab_mechanic/mediawiki_api.py b/dab_mechanic/mediawiki_api.py index 26d7a20..265cb73 100644 --- a/dab_mechanic/mediawiki_api.py +++ b/dab_mechanic/mediawiki_api.py @@ -30,16 +30,62 @@ def call(params: dict[str, str | int]) -> dict[str, Any]: return data.json() -def get_content(title: str) -> str: +def article_exists(title: str) -> bool: + """Get article text.""" + params: dict[str, str | int] = { + "action": "query", + "format": "json", + "formatversion": 2, + "titles": title, + } + return not call(params)["query"]["pages"][0].get("missing") + + +def get_content(title: str) -> tuple[str, int]: """Get article text.""" params: dict[str, str | int] = { "action": "query", "format": "json", "formatversion": 2, "prop": "revisions|info", - "rvprop": "content|timestamp", + "rvprop": "content|timestamp|ids", "titles": title, } data = call(params) - rev: str = data["query"]["pages"][0]["revisions"][0]["content"] - return rev + rev = data["query"]["pages"][0]["revisions"][0] + content: str = rev["content"] + revid: int = int(rev["revid"]) + return content, revid + + +def compare(title: str, new_text: str) -> str: + """Generate a diff for the new article text.""" + params: dict[str, str | int] = { + "format": "json", + "formatversion": 2, + "action": "compare", + "fromtitle": title, + "toslots": "main", + "totext-main": new_text, + "prop": "diff", + } + diff: str = call(params)["compare"]["body"] + return diff + + +def edit_page( + title: str, text: str, summary: str, baserevid: str, token: str +) -> dict[str, str | int]: + """Edit a page on Wikipedia.""" + params: dict[str, str | int] = { + "format": "json", + "formatversion": 2, + "action": "edit", + "title": title, + "text": text, + "baserevid": baserevid, + "token": token, + "summary": summary, + } + edit: str = call(params)["edit"] + return edit diff --git a/dab_mechanic/wikidata_oauth.py b/dab_mechanic/wikidata_oauth.py index 5af0976..5b048b2 100644 --- a/dab_mechanic/wikidata_oauth.py +++ b/dab_mechanic/wikidata_oauth.py @@ -3,8 +3,10 @@ from urllib.parse import urlencode from flask import current_app, session from requests_oauthlib import OAuth1Session -wiki_hostname = "en.wikipedia.org" -api_url = f"https://{wiki_hostname}/w/api.php" +WIKI_HOSTNAME = "en.wikipedia.org" +API_URL = f"https://{WIKI_HOSTNAME}/w/api.php" + +TIMEOUT = 20 def get_edit_proxy() -> dict[str, str]: @@ -28,12 +30,12 @@ def api_post_request(params: dict[str, str | int]): resource_owner_secret=session["owner_secret"], ) proxies = get_edit_proxy() - return oauth.post(api_url, data=params, timeout=10, proxies=proxies) + return oauth.post(API_URL, data=params, timeout=TIMEOUT, proxies=proxies) def raw_request(params): app = current_app - url = api_url + "?" + urlencode(params) + url = API_URL + "?" + urlencode(params) client_key = app.config["CLIENT_KEY"] client_secret = app.config["CLIENT_SECRET"] oauth = OAuth1Session( @@ -43,7 +45,7 @@ def raw_request(params): resource_owner_secret=session["owner_secret"], ) proxies = get_edit_proxy() - return oauth.get(url, timeout=10, proxies=proxies) + return oauth.get(url, timeout=TIMEOUT, proxies=proxies) def api_request(params): diff --git a/dab_mechanic/wikipedia.py b/dab_mechanic/wikipedia.py index 57c03c4..fecf7f2 100644 --- a/dab_mechanic/wikipedia.py +++ b/dab_mechanic/wikipedia.py @@ -68,7 +68,7 @@ def needs_disambig(link: dict[str, Any]) -> bool: ) -def get_article_links(enwiki: str) -> list[str]: +def get_article_links(enwiki: str) -> dict[str, str]: """Get links that appear in this article.""" params: dict[str, str | int] = link_params(enwiki) @@ -92,11 +92,13 @@ def get_article_links(enwiki: str) -> list[str]: params["gplcontinue"] = data["continue"]["gplcontinue"] sleep(0.1) + ret_links = {} for link in set(links): - if link in redirects: - links.update(redirects[link]) + ret_links[link] = link + for r in redirects.get(link, []): + ret_links[r] = link - return list(links) + return ret_links # return {link["title"] for link in r.json()["query"]["pages"][0]["links"]} @@ -121,10 +123,9 @@ def delete_toc(root: lxml.html.HtmlElement) -> None: toc.getparent().remove(toc) -def get_dab_html(dab_num: int, title: str) -> str: +def get_dab_html(dab_num: int, html: str) -> str: """Parse dab page and rewrite links.""" - dab_html = get_article_html(title) - root = lxml.html.fromstring(dab_html) + root = lxml.html.fromstring(html) delete_toc(root) element_id_map = {e.get("id"): e for e in root.findall(".//*[@id]")} @@ -160,10 +161,11 @@ class Article: self.dab_lookup: dict[int, str] = {} self.dab_order: list[str] = [] self.parse: Optional[dict[str, Any]] = None + self.dab_html: dict[str, str] = {} - def save_endpoint(self) -> str: + def preview_endpoint(self) -> str: """Endpoint for saving changes.""" - href: str = flask.url_for("save", enwiki=self.enwiki.replace(" ", "_")) + href: str = flask.url_for("preview", enwiki=self.enwiki.replace(" ", "_")) return href def load(self) -> None: @@ -173,28 +175,34 @@ class Article: def iter_links(self) -> Iterator[tuple[lxml.html.Element, str]]: """Disambiguation links that need fixing.""" - seen = set() for a in self.root.findall(".//a[@href]"): title = a.get("title") - if title is None or title not in self.links: - continue - a.set("class", "disambig") + if title is not None and title in self.links: + yield a, title, self.links[title] - if title in seen: + href = a.get("href") + if not href.startswith("/wiki/"): continue - seen.add(title) + a.set("href", "https://en.wikipedia.org" + href) + a.set("target", "_blank") - yield a, title + def dab_link_to(self): + return [dab["link_to"] for dab in self.dab_list] def process_links(self) -> None: """Process links in parsed wikitext.""" - for dab_num, (a, title) in enumerate(self.iter_links()): + for dab_num, (a, link_to, title) in enumerate(self.iter_links()): + a.set("class", "disambig") a.set("id", f"dab-{dab_num}") + if title not in self.dab_html: + self.dab_html[title] = get_article_html(title) + dab: DabItem = { "num": dab_num, "title": title, - "html": get_dab_html(dab_num, title), + "link_to": link_to, + "html": get_dab_html(dab_num, self.dab_html[title]), } self.dab_list.append(dab) self.dab_order.append(title) diff --git a/templates/article.html b/templates/article.html index 7c9afbf..eb1e62d 100644 --- a/templates/article.html +++ b/templates/article.html @@ -53,8 +53,8 @@ a.new { color: red; }

{{ article.enwiki }}

-
- + +
@@ -62,7 +62,9 @@ a.new { color: red; }
There are {{ article.dab_list | count }} links in the article that need disambiguating.
{% for dab in article.dab_list %}
-

{{ dab.title }}

+
+

{{ dab.title }}

+ {% if dab.title != dab.link_to %}
redirect from {{ dab.link_to }}
{% endif %}
highlight link @@ -70,7 +72,8 @@ a.new { color: red; } cancel selection
-
{{ dab.html | safe }}
+
{{ dab.html | safe }}
+
{% endfor %}
@@ -84,12 +87,38 @@ a.new { color: red; } var edit_set = new Set(); var edits = {}; - var dab_lookup = {{ article.dab_lookup | tojson }}; var dab_order = {{ article.dab_order | tojson }}; + var dab_link_to = {{ article.dab_link_to() | tojson }}; + + var dab_links = document.getElementsByClassName("disambig"); + for(var i=0; i { + event.preventDefault(); + var dab_num = event.target.id.substring(4); + open_dab(dab_num); + }); + } function jump_to(dab_num) { + open_dab(dab_num); + + var link = document.getElementById("dab-" + dab_num); + link.scrollIntoView(); + link.classList.add("disambig-highlight") + return false; + } + + function open_dab(dab_num) { var highlight_title = "text-bg-primary"; + var dab_articles = document.getElementsByClassName("dab-article"); + for(var i=0; i edits[t]).map(t => [t, edits[t]]); + var saves = dab_link_to.map((link_to, num) => ( + {"num": num, "link_to": link_to, "title": edits[num]})); var save_edits = document.getElementById("save-edits"); save_edits.value = JSON.stringify(saves); } @@ -141,7 +169,7 @@ a.new { color: red; } document.getElementById("cancel-" + dab_num).classList.remove("d-none"); var title = element.getAttribute("title"); - edits[dab_lookup[dab_num]] = title; + edits[dab_num] = title; edit_set.add(dab_num); update_edits(); @@ -163,7 +191,7 @@ a.new { color: red; } } function cancel_selection(dab_num) { - delete edits[dab_lookup[dab_num]]; + delete edits[dab_num]; document.getElementById("cancel-" + dab_num).classList.add("d-none"); clear_dab_highlight(dab_num); edit_set.delete(dab_num); diff --git a/templates/index.html b/templates/index.html index 1758d8e..d6d6ca3 100644 --- a/templates/index.html +++ b/templates/index.html @@ -1,7 +1,21 @@ {% extends "base.html" %} +{% block title %}DAB Mechanic{% endblock %} + {% block content %}
+ +
+ article title: + + +
+ + {% if title and not exists %} +

No article titled "{{ title }}" found in Wikipedia.

+ {% endif %} + +
    {% for enwiki, count in articles %}
  1. diff --git a/templates/navbar.html b/templates/navbar.html index 9ce6f07..0b52925 100644 --- a/templates/navbar.html +++ b/templates/navbar.html @@ -15,13 +15,7 @@ Dab Mechanic diff --git a/templates/preview.html b/templates/preview.html new file mode 100644 index 0000000..cd4bb3a --- /dev/null +++ b/templates/preview.html @@ -0,0 +1,39 @@ + + + + + {{ title }} – dab mechanic + + + + + +
    +

    Preview of changes: {{ title }}

    +
    +
    +
    Edit summary
    +

    {{ edit_summary }}

    +
    +
    + {#
    {{ text }}
    #} + + + + + + + + + + {{ diff | safe }} + +
    + +
    + + +
    + + + diff --git a/templates/save.html b/templates/save.html deleted file mode 100644 index 7bd28a5..0000000 --- a/templates/save.html +++ /dev/null @@ -1,18 +0,0 @@ - - - - - {{ title }} – dab mechanic - - - - -
    -

    Save edits: {{ title }}

    -

    Edit summary: {{ edit_summary }}

    -
    -
    -
    {{ text }}
    -
    - - diff --git a/web_view.py b/web_view.py index 2730f23..aa962bc 100755 --- a/web_view.py +++ b/web_view.py @@ -3,7 +3,9 @@ import inspect import json import re -from typing import Optional +from typing import Optional, TypedDict +import mwparserfromhell +from pprint import pprint import flask import lxml.html @@ -64,28 +66,43 @@ def parse_articles_with_dab_links(root: lxml.html.Element) -> list[tuple[str, in @app.route("/") def index(): + title = flask.request.args.get("title") + exists = None + if title: + title = title.strip() + exists = mediawiki_api.article_exists(title) + if exists: + return flask.redirect( + flask.url_for("article_page", enwiki=title.replace(" ", "_")) + ) + r = requests.get(awdl_url, params={"limit": 100}) root = lxml.html.fromstring(r.content) articles = parse_articles_with_dab_links(root) # articles = [line[:-1] for line in open("article_list")] - return flask.render_template("index.html", articles=articles) + return flask.render_template( + "index.html", title=title, exists=exists, articles=articles, + ) -def make_disamb_link(edit: tuple[str, str]) -> str: - """Given an edit return the appropriate link.""" - return f"[[{edit[1]}|{edit[0]}]]" +class Edit(TypedDict): + """Edit to an article.""" + + num: int + link_to: str + title: str -def apply_edits(article_text: str, edits: list[tuple[str, str]]) -> str: +def old_apply_edits(article_text: str, edits: list[Edit]) -> str: """Apply edits to article text.""" def escape(s: str) -> str: return re.escape(s).replace("_", "[ _]").replace(r"\ ", "[ _]") - for link_from, link_to in edits: - print(rf"\[\[{escape(link_from)}\]\]") + for edit in edits: + # print(rf"\[\[{escape(link_from)}\]\]") article_text = re.sub( rf"\[\[{escape(link_from)}\]\]", f"[[{link_to}|{link_from}]]", @@ -95,34 +112,107 @@ def apply_edits(article_text: str, edits: list[tuple[str, str]]) -> str: return article_text -@app.route("/save/", methods=["POST"]) -def save(enwiki: str) -> Response | str: - """Save edits to article.""" - edits = [ - (link_to, link_from) - for link_to, link_from in json.loads(flask.request.form["edits"]) - ] +def make_disamb_link(edit: Edit) -> str: + """Given an edit return the appropriate link.""" + return f"[[{edit['title']}|{edit['link_to']}]]" - enwiki = enwiki.replace("_", " ") + +def build_edit_summary(edits: list[Edit]) -> str: + """Given a list of edits return an edit summary.""" titles = ", ".join(make_disamb_link(edit) for edit in edits[:-1]) if len(titles) > 1: titles += " and " titles += make_disamb_link(edits[-1]) - edit_summary = f"Disambiguate {titles} using [[User:Edward/Dab mechanic]]" + return f"Disambiguate {titles} using [[User:Edward/Dab mechanic]]" - article_text = apply_edits(mediawiki_api.get_content(enwiki), edits) + +def get_links(wikicode, dab_links): + edits = [edit for edit in dab_links if edit.get("title")] + + dab_titles = {dab["link_to"] for dab in edits} + return [ + link for link in wikicode.filter_wikilinks() if str(link.title) in dab_titles + ] + + +def apply_edits(text, dab_links): + wikicode = mwparserfromhell.parse(text) + links = get_links(wikicode, dab_links) + if len(links) != len(dab_links): + print("links:", len(links)) + print("dab_links:", len(dab_links)) + print("dab_links:", dab_links) + assert len(links) == len(dab_links) + + for wikilink, edit in zip(links, dab_links): + if not edit.get("title"): + continue + if not wikilink.text: + wikilink.text = wikilink.title + wikilink.title = edit["title"] + + return str(wikicode) + + +@app.route("/preview/", methods=["POST"]) +def preview(enwiki: str) -> Response | str: + """Preview article edits.""" + enwiki = enwiki.replace("_", " ") + + dab_links = json.loads(flask.request.form["edits"]) + dab_links = [link for link in dab_links if "title" in link] + cur_text, baserevid = mediawiki_api.get_content(enwiki) + + text = apply_edits(cur_text, dab_links) + diff = mediawiki_api.compare(enwiki, text) return flask.render_template( - "save.html", - edit_summary=edit_summary, + "preview.html", + edit_summary=build_edit_summary(dab_links), title=enwiki, - edits=edits, - text=article_text, + edits=dab_links, + diff=diff, ) +def do_save(enwiki: str): + """Update page on Wikipedia.""" + dab_links = json.loads(flask.request.form["edits"]) + dab_links = [link for link in dab_links if "title" in link] + + cur_text, baserevid = mediawiki_api.get_content(enwiki) + + new_text = apply_edits(cur_text, dab_links) + token = wikidata_oauth.get_token() + + summary = build_edit_summary(dab_links) + print(summary) + + edit = mediawiki_api.edit_page( + title=enwiki, + text=new_text, + summary=summary, + baserevid=baserevid, + token=token, + ) + + return edit + + +@app.route("/save/", methods=["GET", "POST"]) +def save(enwiki: str) -> Response | str: + """Save edits to article.""" + enwiki_norm = enwiki.replace("_", " ") + + if flask.request.method == "GET": + return flask.render_template("edit_saved.html", title=enwiki_norm) + + do_save(enwiki_norm) + return flask.redirect(flask.url_for(flask.request.endpoint, enwiki=enwiki)) + + def redirect_if_needed(enwiki: str) -> Optional[Response]: """Check if there are spaces in the article name and redirect.""" return ( @@ -141,6 +231,9 @@ def article_page(enwiki: str) -> Response: if redirect: return redirect + if "owner_key" not in flask.session: + return flask.render_template("login_needed.html") + article = wikipedia.Article(enwiki) article.load() article.process_links()