Compare commits

...

9 commits

9 changed files with 295 additions and 85 deletions

View file

@ -30,16 +30,62 @@ def call(params: dict[str, str | int]) -> dict[str, Any]:
return data.json() return data.json()
def get_content(title: str) -> str: def article_exists(title: str) -> bool:
"""Check whether an article with this title exists."""
params: dict[str, str | int] = {
"action": "query",
"format": "json",
"formatversion": 2,
"titles": title,
}
return not call(params)["query"]["pages"][0].get("missing")
def get_content(title: str) -> tuple[str, int]:
"""Get article text.""" """Get article text."""
params: dict[str, str | int] = { params: dict[str, str | int] = {
"action": "query", "action": "query",
"format": "json", "format": "json",
"formatversion": 2, "formatversion": 2,
"prop": "revisions|info", "prop": "revisions|info",
"rvprop": "content|timestamp", "rvprop": "content|timestamp|ids",
"titles": title, "titles": title,
} }
data = call(params) data = call(params)
rev: str = data["query"]["pages"][0]["revisions"][0]["content"] rev = data["query"]["pages"][0]["revisions"][0]
return rev content: str = rev["content"]
revid: int = int(rev["revid"])
return content, revid
def compare(title: str, new_text: str) -> str:
    """Request a diff between an existing page and proposed new text.

    The page is identified by ``fromtitle`` and the proposed text is sent as
    ``totext-main`` (main slot).  The value returned is the ``body`` field of
    the ``compare`` object in the API response.
    """
    request: dict[str, str | int] = {"format": "json", "formatversion": 2}
    request.update(
        {
            "action": "compare",
            "fromtitle": title,
            "toslots": "main",
            "totext-main": new_text,
            "prop": "diff",
        }
    )
    response = call(request)
    body: str = response["compare"]["body"]
    return body
def edit_page(
    title: str, text: str, summary: str, baserevid: str | int, token: str
) -> dict[str, str | int]:
    """Edit a page on Wikipedia.

    Sends an ``action=edit`` request through ``call`` and returns the
    ``edit`` object from the response.

    Args:
        title: Title of the page to edit.
        text: Value sent as the ``text`` parameter (the new page text).
        summary: Edit summary.
        baserevid: Revision ID sent as ``baserevid``.  Accepts ``int`` as
            well as ``str`` because ``get_content`` yields the revid as an
            ``int`` and callers pass that value straight through.
        token: Value sent as the ``token`` parameter.

    Raises:
        KeyError: If the response has no ``edit`` key.
    """
    params: dict[str, str | int] = {
        "format": "json",
        "formatversion": 2,
        "action": "edit",
        "title": title,
        "text": text,
        "baserevid": baserevid,
        "token": token,
        "summary": summary,
    }
    # Annotated to match the declared return type (it was previously ``str``).
    edit: dict[str, str | int] = call(params)["edit"]
    return edit

View file

@ -7,8 +7,10 @@ from flask import current_app, session
from requests.models import Response from requests.models import Response
from requests_oauthlib import OAuth1Session from requests_oauthlib import OAuth1Session
wiki_hostname = "en.wikipedia.org" WIKI_HOSTNAME = "en.wikipedia.org"
api_url = f"https://{wiki_hostname}/w/api.php" API_URL = f"https://{WIKI_HOSTNAME}/w/api.php"
TIMEOUT = 20
CallParams = dict[str, str | int] CallParams = dict[str, str | int]
@ -33,14 +35,15 @@ def api_post_request(params: CallParams) -> Response:
resource_owner_key=session["owner_key"], resource_owner_key=session["owner_key"],
resource_owner_secret=session["owner_secret"], resource_owner_secret=session["owner_secret"],
) )
r: Response = oauth.post(api_url, data=params, timeout=10, proxies=get_edit_proxy()) proxies = get_edit_proxy()
r: Response = oauth.post(API_URL, data=params, timeout=TIMEOUT, proxies=proxies)
return r return r
def raw_request(params: CallParams) -> Response: def raw_request(params: CallParams) -> Response:
"""Raw request.""" """Raw request."""
app = current_app app = current_app
url = api_url + "?" + urlencode(params) url = API_URL + "?" + urlencode(params)
client_key = app.config["CLIENT_KEY"] client_key = app.config["CLIENT_KEY"]
client_secret = app.config["CLIENT_SECRET"] client_secret = app.config["CLIENT_SECRET"]
oauth = OAuth1Session( oauth = OAuth1Session(
@ -49,7 +52,8 @@ def raw_request(params: CallParams) -> Response:
resource_owner_key=session["owner_key"], resource_owner_key=session["owner_key"],
resource_owner_secret=session["owner_secret"], resource_owner_secret=session["owner_secret"],
) )
r: Response = oauth.get(url, timeout=10, proxies=get_edit_proxy()) proxies = get_edit_proxy()
r: Response = oauth.get(url, timeout=TIMEOUT, proxies=proxies)
return r return r

View file

@ -68,7 +68,7 @@ def needs_disambig(link: dict[str, Any]) -> bool:
) )
def get_article_links(enwiki: str) -> list[str]: def get_article_links(enwiki: str) -> dict[str, str]:
"""Get links that appear in this article.""" """Get links that appear in this article."""
params: dict[str, str | int] = link_params(enwiki) params: dict[str, str | int] = link_params(enwiki)
@ -92,11 +92,13 @@ def get_article_links(enwiki: str) -> list[str]:
params["gplcontinue"] = data["continue"]["gplcontinue"] params["gplcontinue"] = data["continue"]["gplcontinue"]
sleep(0.1) sleep(0.1)
ret_links = {}
for link in set(links): for link in set(links):
if link in redirects: ret_links[link] = link
links.update(redirects[link]) for r in redirects.get(link, []):
ret_links[r] = link
return list(links) return ret_links
# return {link["title"] for link in r.json()["query"]["pages"][0]["links"]} # return {link["title"] for link in r.json()["query"]["pages"][0]["links"]}
@ -121,10 +123,9 @@ def delete_toc(root: lxml.html.HtmlElement) -> None:
toc.getparent().remove(toc) toc.getparent().remove(toc)
def get_dab_html(dab_num: int, title: str) -> str: def get_dab_html(dab_num: int, html: str) -> str:
"""Parse dab page and rewrite links.""" """Parse dab page and rewrite links."""
dab_html = get_article_html(title) root = lxml.html.fromstring(html)
root = lxml.html.fromstring(dab_html)
delete_toc(root) delete_toc(root)
element_id_map = {e.get("id"): e for e in root.findall(".//*[@id]")} element_id_map = {e.get("id"): e for e in root.findall(".//*[@id]")}
@ -160,10 +161,11 @@ class Article:
self.dab_lookup: dict[int, str] = {} self.dab_lookup: dict[int, str] = {}
self.dab_order: list[str] = [] self.dab_order: list[str] = []
self.parse: Optional[dict[str, Any]] = None self.parse: Optional[dict[str, Any]] = None
self.dab_html: dict[str, str] = {}
def save_endpoint(self) -> str: def preview_endpoint(self) -> str:
"""Endpoint for saving changes.""" """Endpoint for previewing changes."""
href: str = flask.url_for("save", enwiki=self.enwiki.replace(" ", "_")) href: str = flask.url_for("preview", enwiki=self.enwiki.replace(" ", "_"))
return href return href
def load(self) -> None: def load(self) -> None:
@ -173,28 +175,34 @@ class Article:
def iter_links(self) -> Iterator[tuple[lxml.html.Element, str]]: def iter_links(self) -> Iterator[tuple[lxml.html.Element, str]]:
"""Disambiguation links that need fixing.""" """Disambiguation links that need fixing."""
seen = set()
for a in self.root.findall(".//a[@href]"): for a in self.root.findall(".//a[@href]"):
title = a.get("title") title = a.get("title")
if title is None or title not in self.links: if title is not None and title in self.links:
continue yield a, title, self.links[title]
a.set("class", "disambig")
if title in seen: href = a.get("href")
if not href.startswith("/wiki/"):
continue continue
seen.add(title) a.set("href", "https://en.wikipedia.org" + href)
a.set("target", "_blank")
yield a, title def dab_link_to(self):
return [dab["link_to"] for dab in self.dab_list]
def process_links(self) -> None: def process_links(self) -> None:
"""Process links in parsed wikitext.""" """Process links in parsed wikitext."""
for dab_num, (a, title) in enumerate(self.iter_links()): for dab_num, (a, link_to, title) in enumerate(self.iter_links()):
a.set("class", "disambig")
a.set("id", f"dab-{dab_num}") a.set("id", f"dab-{dab_num}")
if title not in self.dab_html:
self.dab_html[title] = get_article_html(title)
dab: DabItem = { dab: DabItem = {
"num": dab_num, "num": dab_num,
"title": title, "title": title,
"html": get_dab_html(dab_num, title), "link_to": link_to,
"html": get_dab_html(dab_num, self.dab_html[title]),
} }
self.dab_list.append(dab) self.dab_list.append(dab)
self.dab_order.append(title) self.dab_order.append(title)

View file

@ -53,8 +53,8 @@ a.new { color: red; }
<div id="dabs" class="p-3"> <div id="dabs" class="p-3">
<h1>{{ article.enwiki }}</h1> <h1>{{ article.enwiki }}</h1>
<div id="save-panel" class="d-none"> <div id="save-panel" class="d-none">
<form method="POST" action="{{ article.save_endpoint() }}"> <form method="POST" action="{{ article.preview_endpoint() }}">
<button class="btn btn-primary" id="save-btn">Save</button> <button class="btn btn-primary" id="save-btn">Preview before save</button>
<span id="edit-count"></span> <span id="edit-count"></span>
<input type="hidden" value="{}" id="save-edits" name="edits"> <input type="hidden" value="{}" id="save-edits" name="edits">
</form> </form>
@ -62,7 +62,9 @@ a.new { color: red; }
<div>There are {{ article.dab_list | count }} links in the article that need disambiguating.</div> <div>There are {{ article.dab_list | count }} links in the article that need disambiguating.</div>
{% for dab in article.dab_list %} {% for dab in article.dab_list %}
<div class="card p-1 m-2"> <div class="card p-1 m-2">
<h3 class="card-title" id="dab-card-title-{{ dab.num }}" onclick="return jump_to({{ dab.num }})">{{ dab.title }}</h3> <div class="card-body">
<h3 class="card-title" id="dab-card-title-{{ dab.num }}" onclick="return jump_to({{ dab.num }})">{{ dab.title }}</h3>
{% if dab.title != dab.link_to %}<div>redirect from {{ dab.link_to }}</div>{% endif %}
<div> <div>
<a href="#" onclick="return jump_to({{ dab.num }})">highlight link</a> <a href="#" onclick="return jump_to({{ dab.num }})">highlight link</a>
<span class="d-none" id="cancel-{{ dab.num }}"> <span class="d-none" id="cancel-{{ dab.num }}">
@ -70,7 +72,8 @@ a.new { color: red; }
<a href="#" onclick="return cancel_selection({{ dab.num }})">cancel selection</a> <a href="#" onclick="return cancel_selection({{ dab.num }})">cancel selection</a>
</span> </span>
</div> </div>
<div class="dab-article" id="dab-article-{{ dab.num }}">{{ dab.html | safe }}</div> <div class="dab-article d-none" id="dab-article-{{ dab.num }}">{{ dab.html | safe }}</div>
</div>
</div> </div>
{% endfor %} {% endfor %}
</div> </div>
@ -84,12 +87,38 @@ a.new { color: red; }
var edit_set = new Set(); var edit_set = new Set();
var edits = {}; var edits = {};
var dab_lookup = {{ article.dab_lookup | tojson }};
var dab_order = {{ article.dab_order | tojson }}; var dab_order = {{ article.dab_order | tojson }};
var dab_link_to = {{ article.dab_link_to() | tojson }};
// Clicking a disambiguation link in the article opens the matching panel
// instead of following the link.
var dab_links = document.getElementsByClassName("disambig");
for(var i=0; i<dab_links.length; i++) {
  dab_links[i].addEventListener("click", (event) => {
    event.preventDefault();
    // Use currentTarget (the <a> the listener is attached to) rather than
    // target: when the click lands on a child element of the link, target is
    // that child and has no "dab-N" id.
    var dab_num = event.currentTarget.id.substring(4);  // strip "dab-"
    open_dab(dab_num);
  });
}
function jump_to(dab_num) { function jump_to(dab_num) {
open_dab(dab_num);
var link = document.getElementById("dab-" + dab_num);
link.scrollIntoView();
link.classList.add("disambig-highlight")
return false;
}
function open_dab(dab_num) {
var highlight_title = "text-bg-primary"; var highlight_title = "text-bg-primary";
var dab_articles = document.getElementsByClassName("dab-article");
for(var i=0; i<dab_articles.length; i++) {
dab_articles[i].classList.add("d-none");
}
var dab_article = document.getElementById("dab-article-" + dab_num);
dab_article.classList.remove("d-none");
var links = document.getElementsByTagName("a"); var links = document.getElementsByTagName("a");
for(var i=0; i<links.length; i++) { for(var i=0; i<links.length; i++) {
links[i].classList.remove("disambig-highlight"); links[i].classList.remove("disambig-highlight");
@ -104,9 +133,7 @@ a.new { color: red; }
card_title.classList.add(highlight_title); card_title.classList.add(highlight_title);
var link = document.getElementById("dab-" + dab_num); var link = document.getElementById("dab-" + dab_num);
link.scrollIntoView();
link.classList.add("disambig-highlight") link.classList.add("disambig-highlight")
return false;
} }
function clear_dab_highlight(dab_num) { function clear_dab_highlight(dab_num) {
@ -131,7 +158,8 @@ a.new { color: red; }
} }
function update_edits() { function update_edits() {
var saves = dab_order.filter(t => edits[t]).map(t => [t, edits[t]]); var saves = dab_link_to.map((link_to, num) => (
{"num": num, "link_to": link_to, "title": edits[num]}));
var save_edits = document.getElementById("save-edits"); var save_edits = document.getElementById("save-edits");
save_edits.value = JSON.stringify(saves); save_edits.value = JSON.stringify(saves);
} }
@ -141,7 +169,7 @@ a.new { color: red; }
document.getElementById("cancel-" + dab_num).classList.remove("d-none"); document.getElementById("cancel-" + dab_num).classList.remove("d-none");
var title = element.getAttribute("title"); var title = element.getAttribute("title");
edits[dab_lookup[dab_num]] = title; edits[dab_num] = title;
edit_set.add(dab_num); edit_set.add(dab_num);
update_edits(); update_edits();
@ -163,7 +191,7 @@ a.new { color: red; }
} }
function cancel_selection(dab_num) { function cancel_selection(dab_num) {
delete edits[dab_lookup[dab_num]]; delete edits[dab_num];
document.getElementById("cancel-" + dab_num).classList.add("d-none"); document.getElementById("cancel-" + dab_num).classList.add("d-none");
clear_dab_highlight(dab_num); clear_dab_highlight(dab_num);
edit_set.delete(dab_num); edit_set.delete(dab_num);

View file

@ -1,7 +1,21 @@
{% extends "base.html" %} {% extends "base.html" %}
{% block title %}DAB Mechanic{% endblock %}
{% block content %} {% block content %}
<div class="m-3"> <div class="m-3">
<form>
article title:
<input name="title" value="{{ request.args.get("title", "") }}">
<button class="btn btn-sm btn-primary">go</button>
</form>
{% if title and not exists %}
<p>No article titled "{{ title }}" found in Wikipedia.</p>
{% endif %}
<ol> <ol>
{% for enwiki, count in articles %} {% for enwiki, count in articles %}
<li> <li>

View file

@ -15,13 +15,7 @@
<a class="navbar-brand" href="{{ url_for('index') }}">Dab Mechanic</a> <a class="navbar-brand" href="{{ url_for('index') }}">Dab Mechanic</a>
<ul class="navbar-nav me-auto mb-2 mb-lg-0"> <ul class="navbar-nav me-auto mb-2 mb-lg-0">
<li class="nav-item"> <li class="nav-item">
<a class="nav-link active" aria-current="page" href="#">Home</a> <a class="nav-link active" aria-current="page" href="/">Home</a>
</li>
<li class="nav-item">
<a class="nav-link" href="#">Link</a>
</li>
<li class="nav-item">
<a class="nav-link disabled">Disabled</a>
</li> </li>
</ul> </ul>

39
templates/preview.html Normal file
View file

@ -0,0 +1,39 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>{{ title }} &ndash; dab mechanic</title>
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.2.0/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-gH2yIJqKdNHPEq0n4Mqa/HGKIhSkIHeL5AyhkYV8i59U5AR6csBvApHHNl/vI1Bx" crossorigin="anonymous">
<link rel="stylesheet" href="https://www.mediawiki.org/w/load.php?modules=mediawiki.diff.styles&only=styles">
</head>
<body>
<div class="m-3">
<h2>Preview of changes: {{ title }}</h2>
<div class="card">
<div class="card-body">
<h5 class="card-title">Edit summary</h5>
<p class="card-text">{{ edit_summary }}</p>
</div>
</div>
{# <pre>{{ text }}</pre> #}
<table class="diff my-3">
<colgroup>
<col class="diff-marker">
<col class="diff-content">
<col class="diff-marker">
<col class="diff-content">
</colgroup>
<tbody>
{{ diff | safe }}
</tbody>
</table>
<form method="POST" action="{{ url_for("save", enwiki=title) }}">
<button class="btn btn-primary" id="save-btn">Save changes</button>
<input type="hidden" value="{{ request.form.edits }}" id="save-edits" name="edits">
</form>
</div>
</body>
</html>

View file

@ -1,18 +0,0 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>{{ title }} &ndash; dab mechanic</title>
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.2.0/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-gH2yIJqKdNHPEq0n4Mqa/HGKIhSkIHeL5AyhkYV8i59U5AR6csBvApHHNl/vI1Bx" crossorigin="anonymous">
</head>
<body>
<div class="m-3">
<h2>Save edits: {{ title }}</h2>
<p>Edit summary: {{ edit_summary }}</p>
</div>
<div class="m-3">
<pre>{{ text }}</pre>
</div>
</body>
</html>

View file

@ -5,10 +5,11 @@ import json
import re import re
import sys import sys
import traceback import traceback
from typing import Optional from typing import Optional, TypedDict
import flask import flask
import lxml.html import lxml.html
import mwparserfromhell
import requests import requests
import werkzeug.exceptions import werkzeug.exceptions
from requests_oauthlib import OAuth1Session from requests_oauthlib import OAuth1Session
@ -77,30 +78,48 @@ def parse_articles_with_dab_links(root: lxml.html.HtmlElement) -> list[tuple[str
@app.route("/") @app.route("/")
def index() -> str: def index() -> str | Response:
"""Index page.""" """Index page."""
title = flask.request.args.get("title")
exists = None
if title:
title = title.strip()
exists = mediawiki_api.article_exists(title)
if exists:
return flask.redirect(
flask.url_for("article_page", enwiki=title.replace(" ", "_"))
)
r = requests.get(awdl_url, params={"limit": 100}) r = requests.get(awdl_url, params={"limit": 100})
root = lxml.html.fromstring(r.content) root = lxml.html.fromstring(r.content)
articles = parse_articles_with_dab_links(root) articles = parse_articles_with_dab_links(root)
# articles = [line[:-1] for line in open("article_list")] # articles = [line[:-1] for line in open("article_list")]
return flask.render_template("index.html", articles=articles) return flask.render_template(
"index.html",
title=title,
exists=exists,
articles=articles,
)
def make_disamb_link(edit: tuple[str, str]) -> str: class Edit(TypedDict):
"""Given an edit return the appropriate link.""" """Edit to an article."""
return f"[[{edit[1]}|{edit[0]}]]"
num: int
link_to: str
title: str
def apply_edits(article_text: str, edits: list[tuple[str, str]]) -> str: def old_apply_edits(article_text: str, edits: list[Edit]) -> str:
"""Apply edits to article text.""" """Apply edits to article text."""
def escape(s: str) -> str: def escape(s: str) -> str:
return re.escape(s).replace("_", "[ _]").replace(r"\ ", "[ _]") return re.escape(s).replace("_", "[ _]").replace(r"\ ", "[ _]")
for link_from, link_to in edits: for edit in edits:
print(rf"\[\[{escape(link_from)}\]\]") # print(rf"\[\[{escape(link_from)}\]\]")
article_text = re.sub( article_text = re.sub(
rf"\[\[{escape(link_from)}\]\]", rf"\[\[{escape(link_from)}\]\]",
f"[[{link_to}|{link_from}]]", f"[[{link_to}|{link_from}]]",
@ -110,34 +129,107 @@ def apply_edits(article_text: str, edits: list[tuple[str, str]]) -> str:
return article_text return article_text
@app.route("/save/<path:enwiki>", methods=["POST"]) def make_disamb_link(edit: Edit) -> str:
def save(enwiki: str) -> Response | str: """Given an edit return the appropriate link."""
"""Save edits to article.""" return f"[[{edit['title']}|{edit['link_to']}]]"
edits = [
(link_to, link_from)
for link_to, link_from in json.loads(flask.request.form["edits"])
]
enwiki = enwiki.replace("_", " ")
def build_edit_summary(edits: list[Edit]) -> str:
"""Given a list of edits return an edit summary."""
titles = ", ".join(make_disamb_link(edit) for edit in edits[:-1]) titles = ", ".join(make_disamb_link(edit) for edit in edits[:-1])
if len(titles) > 1: if len(titles) > 1:
titles += " and " titles += " and "
titles += make_disamb_link(edits[-1]) titles += make_disamb_link(edits[-1])
edit_summary = f"Disambiguate {titles} using [[User:Edward/Dab mechanic]]" return f"Disambiguate {titles} using [[User:Edward/Dab mechanic]]"
article_text = apply_edits(mediawiki_api.get_content(enwiki), edits)
def _normalize_title(title: str) -> str:
    """Return a page title with underscores, spacing and first letter normalised."""
    # Underscores and runs of whitespace are equivalent to a single space in
    # link targets; the first letter is upper-cased on the assumption that the
    # target wiki treats it as case-insensitive (true for en.wikipedia).
    cleaned = " ".join(title.replace("_", " ").split())
    return cleaned[:1].upper() + cleaned[1:]


def get_links(
    wikicode: "mwparserfromhell.wikicode.Wikicode", dab_links: "list[Edit]"
) -> "list[mwparserfromhell.nodes.Wikilink]":
    """Return the wikilinks in ``wikicode`` that point at an edited dab link.

    Only entries of ``dab_links`` with a truthy ``"title"`` (a chosen
    replacement target) are considered.  A wikilink matches when its title,
    after normalisation, equals the normalised ``"link_to"`` of one of those
    entries, so ``[[foo_bar]]`` matches a ``link_to`` of ``"Foo bar"``.
    Links are returned in the order ``filter_wikilinks()`` yields them.
    """
    wanted = {
        _normalize_title(dab["link_to"]) for dab in dab_links if dab.get("title")
    }
    return [
        link
        for link in wikicode.filter_wikilinks()
        if _normalize_title(str(link.title)) in wanted
    ]
def apply_edits(text: str, dab_links: "list[Edit]") -> str:
    """Rewrite disambiguation links in article wikitext.

    ``text`` is parsed with mwparserfromhell, the wikilinks selected by
    ``get_links`` are paired positionally with ``dab_links``, and each pair
    whose edit has a ``"title"`` gets that title as its new link target.  A
    link with no display text keeps its old target as the display text, so
    the rendered wording is unchanged.

    Raises:
        ValueError: If the number of matching wikilinks differs from the
            number of entries in ``dab_links``.  Pairing is positional, so a
            mismatch would rewrite the wrong links.
    """
    wikicode = mwparserfromhell.parse(text)
    links = get_links(wikicode, dab_links)

    # Raised explicitly rather than asserted: an assert is stripped under -O,
    # after which zip() would silently truncate and mis-pair links and edits.
    if len(links) != len(dab_links):
        raise ValueError(
            f"found {len(links)} matching wikilinks in the article text "
            f"but {len(dab_links)} dab links were submitted: {dab_links!r}"
        )

    for wikilink, edit in zip(links, dab_links):
        new_target = edit.get("title")
        if not new_target:
            continue
        if not wikilink.text:
            wikilink.text = wikilink.title
        wikilink.title = new_target

    return str(wikicode)
@app.route("/preview/<path:enwiki>", methods=["POST"])
def preview(enwiki: str) -> Response | str:
"""Preview article edits."""
enwiki = enwiki.replace("_", " ")
dab_links = json.loads(flask.request.form["edits"])
dab_links = [link for link in dab_links if "title" in link]
cur_text, baserevid = mediawiki_api.get_content(enwiki)
text = apply_edits(cur_text, dab_links)
diff = mediawiki_api.compare(enwiki, text)
return flask.render_template( return flask.render_template(
"save.html", "preview.html",
edit_summary=edit_summary, edit_summary=build_edit_summary(dab_links),
title=enwiki, title=enwiki,
edits=edits, edits=dab_links,
text=article_text, diff=diff,
) )
def do_save(enwiki: str) -> dict[str, str | int]:
    """Update page on Wikipedia.

    Reads the JSON-encoded ``edits`` field of the submitted form, keeps the
    entries that carry a ``"title"`` key, fetches the current article text
    and revision ID, applies the edits to that text and submits the result
    with ``mediawiki_api.edit_page``.

    Args:
        enwiki: Article title, passed unchanged to the API helpers.

    Returns:
        Whatever ``mediawiki_api.edit_page`` returns for the edit.
    """
    dab_links = json.loads(flask.request.form["edits"])
    dab_links = [link for link in dab_links if "title" in link]

    cur_text, baserevid = mediawiki_api.get_content(enwiki)
    new_text = apply_edits(cur_text, dab_links)

    token = wikidata_oauth.get_token()
    summary = build_edit_summary(dab_links)
    # Record the summary through the application logger, not a bare print.
    flask.current_app.logger.info("saving %s: %s", enwiki, summary)

    return mediawiki_api.edit_page(
        title=enwiki,
        text=new_text,
        summary=summary,
        baserevid=baserevid,
        token=token,
    )
@app.route("/save/<path:enwiki>", methods=["GET", "POST"])
def save(enwiki: str) -> Response | str:
    """Save edits to an article, then show the confirmation page.

    Any request whose method is not GET performs the save and redirects back
    to this same endpoint; a GET renders the "edit saved" template.
    Underscores in the URL title are turned into spaces before the title is
    used for saving or display.
    """
    if flask.request.method != "GET":
        do_save(enwiki.replace("_", " "))
        target = flask.url_for(flask.request.endpoint, enwiki=enwiki)
        return flask.redirect(target)

    return flask.render_template("edit_saved.html", title=enwiki.replace("_", " "))
def redirect_if_needed(enwiki: str) -> Optional[Response]: def redirect_if_needed(enwiki: str) -> Optional[Response]:
"""Check if there are spaces in the article name and redirect.""" """Check if there are spaces in the article name and redirect."""
endpoint = flask.request.endpoint endpoint = flask.request.endpoint
@ -156,6 +248,9 @@ def article_page(enwiki: str) -> Response | str:
if redirect: if redirect:
return redirect return redirect
if "owner_key" not in flask.session:
return flask.render_template("login_needed.html")
article = wikipedia.Article(enwiki) article = wikipedia.Article(enwiki)
article.load() article.load()
article.process_links() article.process_links()