Compare commits

..

No commits in common. "b6953cf52f1186770b7b03eacf07eb9d780edd81" and "4d175c8733b043a51f35d09cf11a4d92de34b498" have entirely different histories.

9 changed files with 85 additions and 291 deletions

View file

@ -30,62 +30,16 @@ def call(params: dict[str, str | int]) -> dict[str, Any]:
return data.json() return data.json()
def article_exists(title: str) -> bool: def get_content(title: str) -> str:
"""Get article text."""
params: dict[str, str | int] = {
"action": "query",
"format": "json",
"formatversion": 2,
"titles": title,
}
return not call(params)["query"]["pages"][0].get("missing")
def get_content(title: str) -> tuple[str, int]:
"""Get article text.""" """Get article text."""
params: dict[str, str | int] = { params: dict[str, str | int] = {
"action": "query", "action": "query",
"format": "json", "format": "json",
"formatversion": 2, "formatversion": 2,
"prop": "revisions|info", "prop": "revisions|info",
"rvprop": "content|timestamp|ids", "rvprop": "content|timestamp",
"titles": title, "titles": title,
} }
data = call(params) data = call(params)
rev = data["query"]["pages"][0]["revisions"][0] rev: str = data["query"]["pages"][0]["revisions"][0]["content"]
content: str = rev["content"] return rev
revid: int = int(rev["revid"])
return content, revid
def compare(title: str, new_text: str) -> str:
"""Generate a diff for the new article text."""
params: dict[str, str | int] = {
"format": "json",
"formatversion": 2,
"action": "compare",
"fromtitle": title,
"toslots": "main",
"totext-main": new_text,
"prop": "diff",
}
diff: str = call(params)["compare"]["body"]
return diff
def edit_page(
title: str, text: str, summary: str, baserevid: str, token: str
) -> dict[str, str | int]:
"""Edit a page on Wikipedia."""
params: dict[str, str | int] = {
"format": "json",
"formatversion": 2,
"action": "edit",
"title": title,
"text": text,
"baserevid": baserevid,
"token": token,
"summary": summary,
}
edit: str = call(params)["edit"]
return edit

View file

@ -3,10 +3,8 @@ from urllib.parse import urlencode
from flask import current_app, session from flask import current_app, session
from requests_oauthlib import OAuth1Session from requests_oauthlib import OAuth1Session
WIKI_HOSTNAME = "en.wikipedia.org" wiki_hostname = "en.wikipedia.org"
API_URL = f"https://{WIKI_HOSTNAME}/w/api.php" api_url = f"https://{wiki_hostname}/w/api.php"
TIMEOUT = 20
def get_edit_proxy() -> dict[str, str]: def get_edit_proxy() -> dict[str, str]:
@ -30,12 +28,12 @@ def api_post_request(params: dict[str, str | int]):
resource_owner_secret=session["owner_secret"], resource_owner_secret=session["owner_secret"],
) )
proxies = get_edit_proxy() proxies = get_edit_proxy()
return oauth.post(API_URL, data=params, timeout=TIMEOUT, proxies=proxies) return oauth.post(api_url, data=params, timeout=10, proxies=proxies)
def raw_request(params): def raw_request(params):
app = current_app app = current_app
url = API_URL + "?" + urlencode(params) url = api_url + "?" + urlencode(params)
client_key = app.config["CLIENT_KEY"] client_key = app.config["CLIENT_KEY"]
client_secret = app.config["CLIENT_SECRET"] client_secret = app.config["CLIENT_SECRET"]
oauth = OAuth1Session( oauth = OAuth1Session(
@ -45,7 +43,7 @@ def raw_request(params):
resource_owner_secret=session["owner_secret"], resource_owner_secret=session["owner_secret"],
) )
proxies = get_edit_proxy() proxies = get_edit_proxy()
return oauth.get(url, timeout=TIMEOUT, proxies=proxies) return oauth.get(url, timeout=10, proxies=proxies)
def api_request(params): def api_request(params):

View file

@ -68,7 +68,7 @@ def needs_disambig(link: dict[str, Any]) -> bool:
) )
def get_article_links(enwiki: str) -> dict[str, str]: def get_article_links(enwiki: str) -> list[str]:
"""Get links that appear in this article.""" """Get links that appear in this article."""
params: dict[str, str | int] = link_params(enwiki) params: dict[str, str | int] = link_params(enwiki)
@ -92,13 +92,11 @@ def get_article_links(enwiki: str) -> dict[str, str]:
params["gplcontinue"] = data["continue"]["gplcontinue"] params["gplcontinue"] = data["continue"]["gplcontinue"]
sleep(0.1) sleep(0.1)
ret_links = {}
for link in set(links): for link in set(links):
ret_links[link] = link if link in redirects:
for r in redirects.get(link, []): links.update(redirects[link])
ret_links[r] = link
return ret_links return list(links)
# return {link["title"] for link in r.json()["query"]["pages"][0]["links"]} # return {link["title"] for link in r.json()["query"]["pages"][0]["links"]}
@ -123,9 +121,10 @@ def delete_toc(root: lxml.html.HtmlElement) -> None:
toc.getparent().remove(toc) toc.getparent().remove(toc)
def get_dab_html(dab_num: int, html: str) -> str: def get_dab_html(dab_num: int, title: str) -> str:
"""Parse dab page and rewrite links.""" """Parse dab page and rewrite links."""
root = lxml.html.fromstring(html) dab_html = get_article_html(title)
root = lxml.html.fromstring(dab_html)
delete_toc(root) delete_toc(root)
element_id_map = {e.get("id"): e for e in root.findall(".//*[@id]")} element_id_map = {e.get("id"): e for e in root.findall(".//*[@id]")}
@ -161,11 +160,10 @@ class Article:
self.dab_lookup: dict[int, str] = {} self.dab_lookup: dict[int, str] = {}
self.dab_order: list[str] = [] self.dab_order: list[str] = []
self.parse: Optional[dict[str, Any]] = None self.parse: Optional[dict[str, Any]] = None
self.dab_html: dict[str, str] = {}
def preview_endpoint(self) -> str: def save_endpoint(self) -> str:
"""Endpoint for saving changes.""" """Endpoint for saving changes."""
href: str = flask.url_for("preview", enwiki=self.enwiki.replace(" ", "_")) href: str = flask.url_for("save", enwiki=self.enwiki.replace(" ", "_"))
return href return href
def load(self) -> None: def load(self) -> None:
@ -175,34 +173,28 @@ class Article:
def iter_links(self) -> Iterator[tuple[lxml.html.Element, str]]: def iter_links(self) -> Iterator[tuple[lxml.html.Element, str]]:
"""Disambiguation links that need fixing.""" """Disambiguation links that need fixing."""
seen = set()
for a in self.root.findall(".//a[@href]"): for a in self.root.findall(".//a[@href]"):
title = a.get("title") title = a.get("title")
if title is not None and title in self.links: if title is None or title not in self.links:
yield a, title, self.links[title]
href = a.get("href")
if not href.startswith("/wiki/"):
continue continue
a.set("href", "https://en.wikipedia.org" + href) a.set("class", "disambig")
a.set("target", "_blank")
def dab_link_to(self): if title in seen:
return [dab["link_to"] for dab in self.dab_list] continue
seen.add(title)
yield a, title
def process_links(self) -> None: def process_links(self) -> None:
"""Process links in parsed wikitext.""" """Process links in parsed wikitext."""
for dab_num, (a, link_to, title) in enumerate(self.iter_links()): for dab_num, (a, title) in enumerate(self.iter_links()):
a.set("class", "disambig")
a.set("id", f"dab-{dab_num}") a.set("id", f"dab-{dab_num}")
if title not in self.dab_html:
self.dab_html[title] = get_article_html(title)
dab: DabItem = { dab: DabItem = {
"num": dab_num, "num": dab_num,
"title": title, "title": title,
"link_to": link_to, "html": get_dab_html(dab_num, title),
"html": get_dab_html(dab_num, self.dab_html[title]),
} }
self.dab_list.append(dab) self.dab_list.append(dab)
self.dab_order.append(title) self.dab_order.append(title)

View file

@ -53,8 +53,8 @@ a.new { color: red; }
<div id="dabs" class="p-3"> <div id="dabs" class="p-3">
<h1>{{ article.enwiki }}</h1> <h1>{{ article.enwiki }}</h1>
<div id="save-panel" class="d-none"> <div id="save-panel" class="d-none">
<form method="POST" action="{{ article.preview_endpoint() }}"> <form method="POST" action="{{ article.save_endpoint() }}">
<button class="btn btn-primary" id="save-btn">Preview before save</button> <button class="btn btn-primary" id="save-btn">Save</button>
<span id="edit-count"></span> <span id="edit-count"></span>
<input type="hidden" value="{}" id="save-edits" name="edits"> <input type="hidden" value="{}" id="save-edits" name="edits">
</form> </form>
@ -62,9 +62,7 @@ a.new { color: red; }
<div>There are {{ article.dab_list | count }} links in the article that need disambiguating.</div> <div>There are {{ article.dab_list | count }} links in the article that need disambiguating.</div>
{% for dab in article.dab_list %} {% for dab in article.dab_list %}
<div class="card p-1 m-2"> <div class="card p-1 m-2">
<div class="card-body">
<h3 class="card-title" id="dab-card-title-{{ dab.num }}" onclick="return jump_to({{ dab.num }})">{{ dab.title }}</h3> <h3 class="card-title" id="dab-card-title-{{ dab.num }}" onclick="return jump_to({{ dab.num }})">{{ dab.title }}</h3>
{% if dab.title != dab.link_to %}<div>redirect from {{ dab.link_to }}</div>{% endif %}
<div> <div>
<a href="#" onclick="return jump_to({{ dab.num }})">highlight link</a> <a href="#" onclick="return jump_to({{ dab.num }})">highlight link</a>
<span class="d-none" id="cancel-{{ dab.num }}"> <span class="d-none" id="cancel-{{ dab.num }}">
@ -72,8 +70,7 @@ a.new { color: red; }
<a href="#" onclick="return cancel_selection({{ dab.num }})">cancel selection</a> <a href="#" onclick="return cancel_selection({{ dab.num }})">cancel selection</a>
</span> </span>
</div> </div>
<div class="dab-article d-none" id="dab-article-{{ dab.num }}">{{ dab.html | safe }}</div> <div class="dab-article" id="dab-article-{{ dab.num }}">{{ dab.html | safe }}</div>
</div>
</div> </div>
{% endfor %} {% endfor %}
</div> </div>
@ -87,38 +84,12 @@ a.new { color: red; }
var edit_set = new Set(); var edit_set = new Set();
var edits = {}; var edits = {};
var dab_lookup = {{ article.dab_lookup | tojson }};
var dab_order = {{ article.dab_order | tojson }}; var dab_order = {{ article.dab_order | tojson }};
var dab_link_to = {{ article.dab_link_to() | tojson }};
var dab_links = document.getElementsByClassName("disambig");
for(var i=0; i<dab_links.length; i++) {
dab_links[i].addEventListener("click", (event) => {
event.preventDefault();
var dab_num = event.target.id.substring(4);
open_dab(dab_num);
});
}
function jump_to(dab_num) { function jump_to(dab_num) {
open_dab(dab_num);
var link = document.getElementById("dab-" + dab_num);
link.scrollIntoView();
link.classList.add("disambig-highlight")
return false;
}
function open_dab(dab_num) {
var highlight_title = "text-bg-primary"; var highlight_title = "text-bg-primary";
var dab_articles = document.getElementsByClassName("dab-article");
for(var i=0; i<dab_articles.length; i++) {
dab_articles[i].classList.add("d-none");
}
var dab_article = document.getElementById("dab-article-" + dab_num);
dab_article.classList.remove("d-none");
var links = document.getElementsByTagName("a"); var links = document.getElementsByTagName("a");
for(var i=0; i<links.length; i++) { for(var i=0; i<links.length; i++) {
links[i].classList.remove("disambig-highlight"); links[i].classList.remove("disambig-highlight");
@ -133,7 +104,9 @@ a.new { color: red; }
card_title.classList.add(highlight_title); card_title.classList.add(highlight_title);
var link = document.getElementById("dab-" + dab_num); var link = document.getElementById("dab-" + dab_num);
link.scrollIntoView();
link.classList.add("disambig-highlight") link.classList.add("disambig-highlight")
return false;
} }
function clear_dab_highlight(dab_num) { function clear_dab_highlight(dab_num) {
@ -158,8 +131,7 @@ a.new { color: red; }
} }
function update_edits() { function update_edits() {
var saves = dab_link_to.map((link_to, num) => ( var saves = dab_order.filter(t => edits[t]).map(t => [t, edits[t]]);
{"num": num, "link_to": link_to, "title": edits[num]}));
var save_edits = document.getElementById("save-edits"); var save_edits = document.getElementById("save-edits");
save_edits.value = JSON.stringify(saves); save_edits.value = JSON.stringify(saves);
} }
@ -169,7 +141,7 @@ a.new { color: red; }
document.getElementById("cancel-" + dab_num).classList.remove("d-none"); document.getElementById("cancel-" + dab_num).classList.remove("d-none");
var title = element.getAttribute("title"); var title = element.getAttribute("title");
edits[dab_num] = title; edits[dab_lookup[dab_num]] = title;
edit_set.add(dab_num); edit_set.add(dab_num);
update_edits(); update_edits();
@ -191,7 +163,7 @@ a.new { color: red; }
} }
function cancel_selection(dab_num) { function cancel_selection(dab_num) {
delete edits[dab_num]; delete edits[dab_lookup[dab_num]];
document.getElementById("cancel-" + dab_num).classList.add("d-none"); document.getElementById("cancel-" + dab_num).classList.add("d-none");
clear_dab_highlight(dab_num); clear_dab_highlight(dab_num);
edit_set.delete(dab_num); edit_set.delete(dab_num);

View file

@ -1,21 +1,7 @@
{% extends "base.html" %} {% extends "base.html" %}
{% block title %}DAB Mechanic{% endblock %}
{% block content %} {% block content %}
<div class="m-3"> <div class="m-3">
<form>
article title:
<input name="title" value="{{ request.args.get("title", "") }}">
<button class="btn btn-sm btn-primary">go</button>
</form>
{% if title and not exists %}
<p>No article titled "{{ title }}" found in Wikipedia.</p>
{% endif %}
<ol> <ol>
{% for enwiki, count in articles %} {% for enwiki, count in articles %}
<li> <li>

View file

@ -15,7 +15,13 @@
<a class="navbar-brand" href="{{ url_for('index') }}">Dab Mechanic</a> <a class="navbar-brand" href="{{ url_for('index') }}">Dab Mechanic</a>
<ul class="navbar-nav me-auto mb-2 mb-lg-0"> <ul class="navbar-nav me-auto mb-2 mb-lg-0">
<li class="nav-item"> <li class="nav-item">
<a class="nav-link active" aria-current="page" href="/">Home</a> <a class="nav-link active" aria-current="page" href="#">Home</a>
</li>
<li class="nav-item">
<a class="nav-link" href="#">Link</a>
</li>
<li class="nav-item">
<a class="nav-link disabled">Disabled</a>
</li> </li>
</ul> </ul>

View file

@ -1,39 +0,0 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>{{ title }} &ndash; dab mechanic</title>
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.2.0/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-gH2yIJqKdNHPEq0n4Mqa/HGKIhSkIHeL5AyhkYV8i59U5AR6csBvApHHNl/vI1Bx" crossorigin="anonymous">
<link rel="stylesheet" href="https://www.mediawiki.org/w/load.php?modules=mediawiki.diff.styles&only=styles">
</head>
<body>
<div class="m-3">
<h2>Preview of changes: {{ title }}</h2>
<div class="card">
<div class="card-body">
<h5 class="card-title">Edit summary</h5>
<p class="card-text">{{ edit_summary }}</p>
</div>
</div>
{# <pre>{{ text }}</pre> #}
<table class="diff my-3">
<colgroup>
<col class="diff-marker">
<col class="diff-content">
<col class="diff-marker">
<col class="diff-content">
</colgroup>
<tbody>
{{ diff | safe }}
</tbody>
</table>
<form method="POST" action="{{ url_for("save", enwiki=title) }}">
<button class="btn btn-primary" id="save-btn">Save changes</button>
<input type="hidden" value="{{ request.form.edits }}" id="save-edits" name="edits">
</form>
</body>
</html>

18
templates/save.html Normal file
View file

@ -0,0 +1,18 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>{{ title }} &ndash; dab mechanic</title>
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.2.0/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-gH2yIJqKdNHPEq0n4Mqa/HGKIhSkIHeL5AyhkYV8i59U5AR6csBvApHHNl/vI1Bx" crossorigin="anonymous">
</head>
<body>
<div class="m-3">
<h2>Save edits: {{ title }}</h2>
<p>Edit summary: {{ edit_summary }}</p>
</div>
<div class="m-3">
<pre>{{ text }}</pre>
</div>
</body>
</html>

View file

@ -3,9 +3,7 @@
import inspect import inspect
import json import json
import re import re
from typing import Optional, TypedDict from typing import Optional
import mwparserfromhell
from pprint import pprint
import flask import flask
import lxml.html import lxml.html
@ -66,43 +64,28 @@ def parse_articles_with_dab_links(root: lxml.html.Element) -> list[tuple[str, in
@app.route("/") @app.route("/")
def index(): def index():
title = flask.request.args.get("title")
exists = None
if title:
title = title.strip()
exists = mediawiki_api.article_exists(title)
if exists:
return flask.redirect(
flask.url_for("article_page", enwiki=title.replace(" ", "_"))
)
r = requests.get(awdl_url, params={"limit": 100}) r = requests.get(awdl_url, params={"limit": 100})
root = lxml.html.fromstring(r.content) root = lxml.html.fromstring(r.content)
articles = parse_articles_with_dab_links(root) articles = parse_articles_with_dab_links(root)
# articles = [line[:-1] for line in open("article_list")] # articles = [line[:-1] for line in open("article_list")]
return flask.render_template( return flask.render_template("index.html", articles=articles)
"index.html", title=title, exists=exists, articles=articles,
)
class Edit(TypedDict): def make_disamb_link(edit: tuple[str, str]) -> str:
"""Edit to an article.""" """Given an edit return the appropriate link."""
return f"[[{edit[1]}|{edit[0]}]]"
num: int
link_to: str
title: str
def old_apply_edits(article_text: str, edits: list[Edit]) -> str: def apply_edits(article_text: str, edits: list[tuple[str, str]]) -> str:
"""Apply edits to article text.""" """Apply edits to article text."""
def escape(s: str) -> str: def escape(s: str) -> str:
return re.escape(s).replace("_", "[ _]").replace(r"\ ", "[ _]") return re.escape(s).replace("_", "[ _]").replace(r"\ ", "[ _]")
for edit in edits: for link_from, link_to in edits:
# print(rf"\[\[{escape(link_from)}\]\]") print(rf"\[\[{escape(link_from)}\]\]")
article_text = re.sub( article_text = re.sub(
rf"\[\[{escape(link_from)}\]\]", rf"\[\[{escape(link_from)}\]\]",
f"[[{link_to}|{link_from}]]", f"[[{link_to}|{link_from}]]",
@ -112,107 +95,34 @@ def old_apply_edits(article_text: str, edits: list[Edit]) -> str:
return article_text return article_text
def make_disamb_link(edit: Edit) -> str: @app.route("/save/<path:enwiki>", methods=["POST"])
"""Given an edit return the appropriate link.""" def save(enwiki: str) -> Response | str:
return f"[[{edit['title']}|{edit['link_to']}]]" """Save edits to article."""
edits = [
(link_to, link_from)
for link_to, link_from in json.loads(flask.request.form["edits"])
]
enwiki = enwiki.replace("_", " ")
def build_edit_summary(edits: list[Edit]) -> str:
"""Given a list of edits return an edit summary."""
titles = ", ".join(make_disamb_link(edit) for edit in edits[:-1]) titles = ", ".join(make_disamb_link(edit) for edit in edits[:-1])
if len(titles) > 1: if len(titles) > 1:
titles += " and " titles += " and "
titles += make_disamb_link(edits[-1]) titles += make_disamb_link(edits[-1])
return f"Disambiguate {titles} using [[User:Edward/Dab mechanic]]" edit_summary = f"Disambiguate {titles} using [[User:Edward/Dab mechanic]]"
article_text = apply_edits(mediawiki_api.get_content(enwiki), edits)
def get_links(wikicode, dab_links):
edits = [edit for edit in dab_links if edit.get("title")]
dab_titles = {dab["link_to"] for dab in edits}
return [
link for link in wikicode.filter_wikilinks() if str(link.title) in dab_titles
]
def apply_edits(text, dab_links):
wikicode = mwparserfromhell.parse(text)
links = get_links(wikicode, dab_links)
if len(links) != len(dab_links):
print("links:", len(links))
print("dab_links:", len(dab_links))
print("dab_links:", dab_links)
assert len(links) == len(dab_links)
for wikilink, edit in zip(links, dab_links):
if not edit.get("title"):
continue
if not wikilink.text:
wikilink.text = wikilink.title
wikilink.title = edit["title"]
return str(wikicode)
@app.route("/preview/<path:enwiki>", methods=["POST"])
def preview(enwiki: str) -> Response | str:
"""Preview article edits."""
enwiki = enwiki.replace("_", " ")
dab_links = json.loads(flask.request.form["edits"])
dab_links = [link for link in dab_links if "title" in link]
cur_text, baserevid = mediawiki_api.get_content(enwiki)
text = apply_edits(cur_text, dab_links)
diff = mediawiki_api.compare(enwiki, text)
return flask.render_template( return flask.render_template(
"preview.html", "save.html",
edit_summary=build_edit_summary(dab_links), edit_summary=edit_summary,
title=enwiki, title=enwiki,
edits=dab_links, edits=edits,
diff=diff, text=article_text,
) )
def do_save(enwiki: str):
"""Update page on Wikipedia."""
dab_links = json.loads(flask.request.form["edits"])
dab_links = [link for link in dab_links if "title" in link]
cur_text, baserevid = mediawiki_api.get_content(enwiki)
new_text = apply_edits(cur_text, dab_links)
token = wikidata_oauth.get_token()
summary = build_edit_summary(dab_links)
print(summary)
edit = mediawiki_api.edit_page(
title=enwiki,
text=new_text,
summary=summary,
baserevid=baserevid,
token=token,
)
return edit
@app.route("/save/<path:enwiki>", methods=["GET", "POST"])
def save(enwiki: str) -> Response | str:
"""Save edits to article."""
enwiki_norm = enwiki.replace("_", " ")
if flask.request.method == "GET":
return flask.render_template("edit_saved.html", title=enwiki_norm)
do_save(enwiki_norm)
return flask.redirect(flask.url_for(flask.request.endpoint, enwiki=enwiki))
def redirect_if_needed(enwiki: str) -> Optional[Response]: def redirect_if_needed(enwiki: str) -> Optional[Response]:
"""Check if there are spaces in the article name and redirect.""" """Check if there are spaces in the article name and redirect."""
return ( return (
@ -231,9 +141,6 @@ def article_page(enwiki: str) -> Response:
if redirect: if redirect:
return redirect return redirect
if "owner_key" not in flask.session:
return flask.render_template("login_needed.html")
article = wikipedia.Article(enwiki) article = wikipedia.Article(enwiki)
article.load() article.load()
article.process_links() article.process_links()