Compare commits

..

1 commit

Author SHA1 Message Date
Edward Betts c3c250134d Pull in some code from one-at-a-time branch 2023-09-29 20:15:02 +01:00
8 changed files with 82 additions and 254 deletions

View file

@ -1,6 +1,7 @@
"""Interface with the mediawiki API.""" """Interface with the mediawiki API."""
from typing import Any from typing import Any
from . import wikidata_oauth from . import wikidata_oauth
wiki_hostname = "en.wikipedia.org" wiki_hostname = "en.wikipedia.org"
@ -30,32 +31,19 @@ def call(params: dict[str, str | int]) -> dict[str, Any]:
return data.json() return data.json()
def article_exists(title: str) -> bool: def get_content(title: str) -> str:
"""Get article text."""
params: dict[str, str | int] = {
"action": "query",
"format": "json",
"formatversion": 2,
"titles": title,
}
return not call(params)["query"]["pages"][0].get("missing")
def get_content(title: str) -> tuple[str, int]:
"""Get article text.""" """Get article text."""
params: dict[str, str | int] = { params: dict[str, str | int] = {
"action": "query", "action": "query",
"format": "json", "format": "json",
"formatversion": 2, "formatversion": 2,
"prop": "revisions|info", "prop": "revisions|info",
"rvprop": "content|timestamp|ids", "rvprop": "content|timestamp",
"titles": title, "titles": title,
} }
data = call(params) data = call(params)
rev = data["query"]["pages"][0]["revisions"][0] rev: str = data["query"]["pages"][0]["revisions"][0]["content"]
content: str = rev["content"] return rev
revid: int = int(rev["revid"])
return content, revid
def compare(title: str, new_text: str) -> str: def compare(title: str, new_text: str) -> str:

View file

@ -68,7 +68,7 @@ def needs_disambig(link: dict[str, Any]) -> bool:
) )
def get_article_links(enwiki: str) -> dict[str, str]: def get_article_links(enwiki: str) -> list[str]:
"""Get links that appear in this article.""" """Get links that appear in this article."""
params: dict[str, str | int] = link_params(enwiki) params: dict[str, str | int] = link_params(enwiki)
@ -92,13 +92,11 @@ def get_article_links(enwiki: str) -> dict[str, str]:
params["gplcontinue"] = data["continue"]["gplcontinue"] params["gplcontinue"] = data["continue"]["gplcontinue"]
sleep(0.1) sleep(0.1)
ret_links = {}
for link in set(links): for link in set(links):
ret_links[link] = link if link in redirects:
for r in redirects.get(link, []): links.update(redirects[link])
ret_links[r] = link
return ret_links return list(links)
# return {link["title"] for link in r.json()["query"]["pages"][0]["links"]} # return {link["title"] for link in r.json()["query"]["pages"][0]["links"]}
@ -123,9 +121,10 @@ def delete_toc(root: lxml.html.HtmlElement) -> None:
toc.getparent().remove(toc) toc.getparent().remove(toc)
def get_dab_html(dab_num: int, html: str) -> str: def get_dab_html(dab_num: int, title: str) -> str:
"""Parse dab page and rewrite links.""" """Parse dab page and rewrite links."""
root = lxml.html.fromstring(html) dab_html = get_article_html(title)
root = lxml.html.fromstring(dab_html)
delete_toc(root) delete_toc(root)
element_id_map = {e.get("id"): e for e in root.findall(".//*[@id]")} element_id_map = {e.get("id"): e for e in root.findall(".//*[@id]")}
@ -161,11 +160,10 @@ class Article:
self.dab_lookup: dict[int, str] = {} self.dab_lookup: dict[int, str] = {}
self.dab_order: list[str] = [] self.dab_order: list[str] = []
self.parse: Optional[dict[str, Any]] = None self.parse: Optional[dict[str, Any]] = None
self.dab_html: dict[str, str] = {}
def preview_endpoint(self) -> str: def save_endpoint(self) -> str:
"""Endpoint for saving changes.""" """Endpoint for saving changes."""
href: str = flask.url_for("preview", enwiki=self.enwiki.replace(" ", "_")) href: str = flask.url_for("save", enwiki=self.enwiki.replace(" ", "_"))
return href return href
def load(self) -> None: def load(self) -> None:
@ -175,34 +173,28 @@ class Article:
def iter_links(self) -> Iterator[tuple[lxml.html.Element, str]]: def iter_links(self) -> Iterator[tuple[lxml.html.Element, str]]:
"""Disambiguation links that need fixing.""" """Disambiguation links that need fixing."""
seen = set()
for a in self.root.findall(".//a[@href]"): for a in self.root.findall(".//a[@href]"):
title = a.get("title") title = a.get("title")
if title is not None and title in self.links: if title is None or title not in self.links:
yield a, title, self.links[title]
href = a.get("href")
if not href.startswith("/wiki/"):
continue continue
a.set("href", "https://en.wikipedia.org" + href) a.set("class", "disambig")
a.set("target", "_blank")
def dab_link_to(self): if title in seen:
return [dab["link_to"] for dab in self.dab_list] continue
seen.add(title)
yield a, title
def process_links(self) -> None: def process_links(self) -> None:
"""Process links in parsed wikitext.""" """Process links in parsed wikitext."""
for dab_num, (a, link_to, title) in enumerate(self.iter_links()): for dab_num, (a, title) in enumerate(self.iter_links()):
a.set("class", "disambig")
a.set("id", f"dab-{dab_num}") a.set("id", f"dab-{dab_num}")
if title not in self.dab_html:
self.dab_html[title] = get_article_html(title)
dab: DabItem = { dab: DabItem = {
"num": dab_num, "num": dab_num,
"title": title, "title": title,
"link_to": link_to, "html": get_dab_html(dab_num, title),
"html": get_dab_html(dab_num, self.dab_html[title]),
} }
self.dab_list.append(dab) self.dab_list.append(dab)
self.dab_order.append(title) self.dab_order.append(title)

View file

@ -53,8 +53,8 @@ a.new { color: red; }
<div id="dabs" class="p-3"> <div id="dabs" class="p-3">
<h1>{{ article.enwiki }}</h1> <h1>{{ article.enwiki }}</h1>
<div id="save-panel" class="d-none"> <div id="save-panel" class="d-none">
<form method="POST" action="{{ article.preview_endpoint() }}"> <form method="POST" action="{{ article.save_endpoint() }}">
<button class="btn btn-primary" id="save-btn">Preview before save</button> <button class="btn btn-primary" id="save-btn">Save</button>
<span id="edit-count"></span> <span id="edit-count"></span>
<input type="hidden" value="{}" id="save-edits" name="edits"> <input type="hidden" value="{}" id="save-edits" name="edits">
</form> </form>
@ -62,9 +62,7 @@ a.new { color: red; }
<div>There are {{ article.dab_list | count }} links in the article that need disambiguating.</div> <div>There are {{ article.dab_list | count }} links in the article that need disambiguating.</div>
{% for dab in article.dab_list %} {% for dab in article.dab_list %}
<div class="card p-1 m-2"> <div class="card p-1 m-2">
<div class="card-body">
<h3 class="card-title" id="dab-card-title-{{ dab.num }}" onclick="return jump_to({{ dab.num }})">{{ dab.title }}</h3> <h3 class="card-title" id="dab-card-title-{{ dab.num }}" onclick="return jump_to({{ dab.num }})">{{ dab.title }}</h3>
{% if dab.title != dab.link_to %}<div>redirect from {{ dab.link_to }}</div>{% endif %}
<div> <div>
<a href="#" onclick="return jump_to({{ dab.num }})">highlight link</a> <a href="#" onclick="return jump_to({{ dab.num }})">highlight link</a>
<span class="d-none" id="cancel-{{ dab.num }}"> <span class="d-none" id="cancel-{{ dab.num }}">
@ -72,8 +70,7 @@ a.new { color: red; }
<a href="#" onclick="return cancel_selection({{ dab.num }})">cancel selection</a> <a href="#" onclick="return cancel_selection({{ dab.num }})">cancel selection</a>
</span> </span>
</div> </div>
<div class="dab-article d-none" id="dab-article-{{ dab.num }}">{{ dab.html | safe }}</div> <div class="dab-article" id="dab-article-{{ dab.num }}">{{ dab.html | safe }}</div>
</div>
</div> </div>
{% endfor %} {% endfor %}
</div> </div>
@ -87,38 +84,12 @@ a.new { color: red; }
var edit_set = new Set(); var edit_set = new Set();
var edits = {}; var edits = {};
var dab_lookup = {{ article.dab_lookup | tojson }};
var dab_order = {{ article.dab_order | tojson }}; var dab_order = {{ article.dab_order | tojson }};
var dab_link_to = {{ article.dab_link_to() | tojson }};
var dab_links = document.getElementsByClassName("disambig");
for(var i=0; i<dab_links.length; i++) {
dab_links[i].addEventListener("click", (event) => {
event.preventDefault();
var dab_num = event.target.id.substring(4);
open_dab(dab_num);
});
}
function jump_to(dab_num) { function jump_to(dab_num) {
open_dab(dab_num);
var link = document.getElementById("dab-" + dab_num);
link.scrollIntoView();
link.classList.add("disambig-highlight")
return false;
}
function open_dab(dab_num) {
var highlight_title = "text-bg-primary"; var highlight_title = "text-bg-primary";
var dab_articles = document.getElementsByClassName("dab-article");
for(var i=0; i<dab_articles.length; i++) {
dab_articles[i].classList.add("d-none");
}
var dab_article = document.getElementById("dab-article-" + dab_num);
dab_article.classList.remove("d-none");
var links = document.getElementsByTagName("a"); var links = document.getElementsByTagName("a");
for(var i=0; i<links.length; i++) { for(var i=0; i<links.length; i++) {
links[i].classList.remove("disambig-highlight"); links[i].classList.remove("disambig-highlight");
@ -133,7 +104,9 @@ a.new { color: red; }
card_title.classList.add(highlight_title); card_title.classList.add(highlight_title);
var link = document.getElementById("dab-" + dab_num); var link = document.getElementById("dab-" + dab_num);
link.scrollIntoView();
link.classList.add("disambig-highlight") link.classList.add("disambig-highlight")
return false;
} }
function clear_dab_highlight(dab_num) { function clear_dab_highlight(dab_num) {
@ -158,8 +131,7 @@ a.new { color: red; }
} }
function update_edits() { function update_edits() {
var saves = dab_link_to.map((link_to, num) => ( var saves = dab_order.filter(t => edits[t]).map(t => [t, edits[t]]);
{"num": num, "link_to": link_to, "title": edits[num]}));
var save_edits = document.getElementById("save-edits"); var save_edits = document.getElementById("save-edits");
save_edits.value = JSON.stringify(saves); save_edits.value = JSON.stringify(saves);
} }
@ -169,7 +141,7 @@ a.new { color: red; }
document.getElementById("cancel-" + dab_num).classList.remove("d-none"); document.getElementById("cancel-" + dab_num).classList.remove("d-none");
var title = element.getAttribute("title"); var title = element.getAttribute("title");
edits[dab_num] = title; edits[dab_lookup[dab_num]] = title;
edit_set.add(dab_num); edit_set.add(dab_num);
update_edits(); update_edits();
@ -191,7 +163,7 @@ a.new { color: red; }
} }
function cancel_selection(dab_num) { function cancel_selection(dab_num) {
delete edits[dab_num]; delete edits[dab_lookup[dab_num]];
document.getElementById("cancel-" + dab_num).classList.add("d-none"); document.getElementById("cancel-" + dab_num).classList.add("d-none");
clear_dab_highlight(dab_num); clear_dab_highlight(dab_num);
edit_set.delete(dab_num); edit_set.delete(dab_num);

View file

@ -1,21 +1,7 @@
{% extends "base.html" %} {% extends "base.html" %}
{% block title %}DAB Mechanic{% endblock %}
{% block content %} {% block content %}
<div class="m-3"> <div class="m-3">
<form>
article title:
<input name="title" value="{{ request.args.get("title", "") }}">
<button class="btn btn-sm btn-primary">go</button>
</form>
{% if title and not exists %}
<p>No article titled "{{ title }}" found in Wikipedia.</p>
{% endif %}
<ol> <ol>
{% for enwiki, count in articles %} {% for enwiki, count in articles %}
<li> <li>

View file

@ -15,7 +15,13 @@
<a class="navbar-brand" href="{{ url_for('index') }}">Dab Mechanic</a> <a class="navbar-brand" href="{{ url_for('index') }}">Dab Mechanic</a>
<ul class="navbar-nav me-auto mb-2 mb-lg-0"> <ul class="navbar-nav me-auto mb-2 mb-lg-0">
<li class="nav-item"> <li class="nav-item">
<a class="nav-link active" aria-current="page" href="/">Home</a> <a class="nav-link active" aria-current="page" href="#">Home</a>
</li>
<li class="nav-item">
<a class="nav-link" href="#">Link</a>
</li>
<li class="nav-item">
<a class="nav-link disabled">Disabled</a>
</li> </li>
</ul> </ul>

View file

@ -1,39 +0,0 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>{{ title }} &ndash; dab mechanic</title>
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.2.0/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-gH2yIJqKdNHPEq0n4Mqa/HGKIhSkIHeL5AyhkYV8i59U5AR6csBvApHHNl/vI1Bx" crossorigin="anonymous">
<link rel="stylesheet" href="https://www.mediawiki.org/w/load.php?modules=mediawiki.diff.styles&only=styles">
</head>
<body>
<div class="m-3">
<h2>Preview of changes: {{ title }}</h2>
<div class="card">
<div class="card-body">
<h5 class="card-title">Edit summary</h5>
<p class="card-text">{{ edit_summary }}</p>
</div>
</div>
{# <pre>{{ text }}</pre> #}
<table class="diff my-3">
<colgroup>
<col class="diff-marker">
<col class="diff-content">
<col class="diff-marker">
<col class="diff-content">
</colgroup>
<tbody>
{{ diff | safe }}
</tbody>
</table>
<form method="POST" action="{{ url_for("save", enwiki=title) }}">
<button class="btn btn-primary" id="save-btn">Save changes</button>
<input type="hidden" value="{{ request.form.edits }}" id="save-edits" name="edits">
</form>
</body>
</html>

18
templates/save.html Normal file
View file

@ -0,0 +1,18 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>{{ title }} &ndash; dab mechanic</title>
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.2.0/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-gH2yIJqKdNHPEq0n4Mqa/HGKIhSkIHeL5AyhkYV8i59U5AR6csBvApHHNl/vI1Bx" crossorigin="anonymous">
</head>
<body>
<div class="m-3">
<h2>Save edits: {{ title }}</h2>
<p>Edit summary: {{ edit_summary }}</p>
</div>
<div class="m-3">
<pre>{{ text }}</pre>
</div>
</body>
</html>

View file

@ -5,11 +5,10 @@ import json
import re import re
import sys import sys
import traceback import traceback
from typing import Optional, TypedDict from typing import Optional
import flask import flask
import lxml.html import lxml.html
import mwparserfromhell
import requests import requests
import werkzeug.exceptions import werkzeug.exceptions
from requests_oauthlib import OAuth1Session from requests_oauthlib import OAuth1Session
@ -78,48 +77,30 @@ def parse_articles_with_dab_links(root: lxml.html.HtmlElement) -> list[tuple[str
@app.route("/") @app.route("/")
def index() -> str | Response: def index() -> str:
"""Index page.""" """Index page."""
title = flask.request.args.get("title")
exists = None
if title:
title = title.strip()
exists = mediawiki_api.article_exists(title)
if exists:
return flask.redirect(
flask.url_for("article_page", enwiki=title.replace(" ", "_"))
)
r = requests.get(awdl_url, params={"limit": 100}) r = requests.get(awdl_url, params={"limit": 100})
root = lxml.html.fromstring(r.content) root = lxml.html.fromstring(r.content)
articles = parse_articles_with_dab_links(root) articles = parse_articles_with_dab_links(root)
# articles = [line[:-1] for line in open("article_list")] # articles = [line[:-1] for line in open("article_list")]
return flask.render_template( return flask.render_template("index.html", articles=articles)
"index.html",
title=title,
exists=exists,
articles=articles,
)
class Edit(TypedDict): def make_disamb_link(edit: tuple[str, str]) -> str:
"""Edit to an article.""" """Given an edit return the appropriate link."""
return f"[[{edit[1]}|{edit[0]}]]"
num: int
link_to: str
title: str
def old_apply_edits(article_text: str, edits: list[Edit]) -> str: def apply_edits(article_text: str, edits: list[tuple[str, str]]) -> str:
"""Apply edits to article text.""" """Apply edits to article text."""
def escape(s: str) -> str: def escape(s: str) -> str:
return re.escape(s).replace("_", "[ _]").replace(r"\ ", "[ _]") return re.escape(s).replace("_", "[ _]").replace(r"\ ", "[ _]")
for edit in edits: for link_from, link_to in edits:
# print(rf"\[\[{escape(link_from)}\]\]") print(rf"\[\[{escape(link_from)}\]\]")
article_text = re.sub( article_text = re.sub(
rf"\[\[{escape(link_from)}\]\]", rf"\[\[{escape(link_from)}\]\]",
f"[[{link_to}|{link_from}]]", f"[[{link_to}|{link_from}]]",
@ -129,107 +110,34 @@ def old_apply_edits(article_text: str, edits: list[Edit]) -> str:
return article_text return article_text
def make_disamb_link(edit: Edit) -> str: @app.route("/save/<path:enwiki>", methods=["POST"])
"""Given an edit return the appropriate link.""" def save(enwiki: str) -> Response | str:
return f"[[{edit['title']}|{edit['link_to']}]]" """Save edits to article."""
edits = [
(link_to, link_from)
for link_to, link_from in json.loads(flask.request.form["edits"])
]
enwiki = enwiki.replace("_", " ")
def build_edit_summary(edits: list[Edit]) -> str:
"""Given a list of edits return an edit summary."""
titles = ", ".join(make_disamb_link(edit) for edit in edits[:-1]) titles = ", ".join(make_disamb_link(edit) for edit in edits[:-1])
if len(titles) > 1: if len(titles) > 1:
titles += " and " titles += " and "
titles += make_disamb_link(edits[-1]) titles += make_disamb_link(edits[-1])
return f"Disambiguate {titles} using [[User:Edward/Dab mechanic]]" edit_summary = f"Disambiguate {titles} using [[User:Edward/Dab mechanic]]"
article_text = apply_edits(mediawiki_api.get_content(enwiki), edits)
def get_links(wikicode, dab_links):
edits = [edit for edit in dab_links if edit.get("title")]
dab_titles = {dab["link_to"] for dab in edits}
return [
link for link in wikicode.filter_wikilinks() if str(link.title) in dab_titles
]
def apply_edits(text, dab_links):
wikicode = mwparserfromhell.parse(text)
links = get_links(wikicode, dab_links)
if len(links) != len(dab_links):
print("links:", len(links))
print("dab_links:", len(dab_links))
print("dab_links:", dab_links)
assert len(links) == len(dab_links)
for wikilink, edit in zip(links, dab_links):
if not edit.get("title"):
continue
if not wikilink.text:
wikilink.text = wikilink.title
wikilink.title = edit["title"]
return str(wikicode)
@app.route("/preview/<path:enwiki>", methods=["POST"])
def preview(enwiki: str) -> Response | str:
"""Preview article edits."""
enwiki = enwiki.replace("_", " ")
dab_links = json.loads(flask.request.form["edits"])
dab_links = [link for link in dab_links if "title" in link]
cur_text, baserevid = mediawiki_api.get_content(enwiki)
text = apply_edits(cur_text, dab_links)
diff = mediawiki_api.compare(enwiki, text)
return flask.render_template( return flask.render_template(
"preview.html", "save.html",
edit_summary=build_edit_summary(dab_links), edit_summary=edit_summary,
title=enwiki, title=enwiki,
edits=dab_links, edits=edits,
diff=diff, text=article_text,
) )
def do_save(enwiki: str):
"""Update page on Wikipedia."""
dab_links = json.loads(flask.request.form["edits"])
dab_links = [link for link in dab_links if "title" in link]
cur_text, baserevid = mediawiki_api.get_content(enwiki)
new_text = apply_edits(cur_text, dab_links)
token = wikidata_oauth.get_token()
summary = build_edit_summary(dab_links)
print(summary)
edit = mediawiki_api.edit_page(
title=enwiki,
text=new_text,
summary=summary,
baserevid=baserevid,
token=token,
)
return edit
@app.route("/save/<path:enwiki>", methods=["GET", "POST"])
def save(enwiki: str) -> Response | str:
"""Save edits to article."""
enwiki_norm = enwiki.replace("_", " ")
if flask.request.method == "GET":
return flask.render_template("edit_saved.html", title=enwiki_norm)
do_save(enwiki_norm)
return flask.redirect(flask.url_for(flask.request.endpoint, enwiki=enwiki))
def redirect_if_needed(enwiki: str) -> Optional[Response]: def redirect_if_needed(enwiki: str) -> Optional[Response]:
"""Check if there are spaces in the article name and redirect.""" """Check if there are spaces in the article name and redirect."""
endpoint = flask.request.endpoint endpoint = flask.request.endpoint
@ -248,9 +156,6 @@ def article_page(enwiki: str) -> Response | str:
if redirect: if redirect:
return redirect return redirect
if "owner_key" not in flask.session:
return flask.render_template("login_needed.html")
article = wikipedia.Article(enwiki) article = wikipedia.Article(enwiki)
article.load() article.load()
article.process_links() article.process_links()