Compare commits
No commits in common. "4d175c8733b043a51f35d09cf11a4d92de34b498" and "d499c896b442bfdde097445162ba0697f516260d" have entirely different histories.
4d175c8733 ... d499c896b4
.gitignore vendored (1 changed line)

@@ -1 +0,0 @@
-__pycache__

article_list (new file, 50 lines)

@@ -0,0 +1,50 @@
+Rail transport in Indonesia
+SchleFaZ
+Chicago Bulls
+Orwell Prize
+List of fatal victims of the September 11 attacks
+Arabic exonyms
+Canadian Alpine Ski Championships
+Method Man filmography
+Popular Union
+The Cantos
+Unisex name
+United States Alpine Ski Championships
+AS Kaloum Star
+Akademi Fantasia (season 1)
+Athletics at the 2022 Bolivarian Games
+I Love the 2000s
+Kununokuni
+List of Wisin & Yandel collaborations
+List of comics based on films
+List of programs broadcast by Asianet
+Urban Hymns
+1979 Sydney City FC season
+2007 in Spanish television
+2022 World Athletics U20 Championships – Men's 4 × 100 metres relay
+A2 autostrada (Poland)
+Black to the Future (TV series)
+Chandel (Rajput clan)
+County of Isenburg
+Dinka people
+Dwayne McDuffie Award for Diversity in Comics
+FTSE Italia Mid Cap
+Globoplay
+Index of Armenia-related articles
+List of Equinox episodes
+List of Indian monarchs
+List of Italian exonyms in Dalmatia
+List of Ultimate Marvel characters
+List of cities with historical German exonyms
+List of jötnar in Norse mythology
+List of language families
+List of people with surname Davis
+List of political parties in Venezuela
+List of programmes broadcast by HTV
+Paul (given name)
+Principality of Lippe
+Propaganda in Russia
+Qazi Ghulam Mustafa
+Redfern Now
+Roy Orbison/The Beatles Tour
+Royal Birmingham Conservatoire

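The new article_list file is a plain text list of Wikipedia article titles, one per line. A minimal sketch of loading it, assuming the file sits in the working directory (the read_article_list helper is illustrative, not part of the repo):

    from pathlib import Path

    def read_article_list(path: str = "article_list") -> list[str]:
        """Return article titles from the file, one per line, skipping blanks."""
        return [
            line.strip()
            for line in Path(path).read_text().splitlines()
            if line.strip()
        ]
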
dab_mechanic/mediawiki_api.py (file deleted, 45 lines)

@@ -1,45 +0,0 @@
-"""Interface with the mediawiki API."""
-
-from typing import Any
-from . import wikidata_oauth
-
-wiki_hostname = "en.wikipedia.org"
-wiki_api_php = f"https://{wiki_hostname}/w/api.php"
-user_agent = "dab-mechanic/0.1"
-
-
-def parse_page(enwiki: str) -> dict[str, Any]:
-    """Call mediawiki parse API for given article."""
-    params: dict[str, str | int] = {
-        "action": "parse",
-        "format": "json",
-        "formatversion": 2,
-        "disableeditsection": 1,
-        "page": enwiki,
-        "prop": "text|links|headhtml",
-        "disabletoc": 1,
-    }
-
-    parse: dict[str, Any] = call(params)["parse"]
-    return parse
-
-
-def call(params: dict[str, str | int]) -> dict[str, Any]:
-    """Make GET request to mediawiki API."""
-    data: dict[str, Any] = wikidata_oauth.api_post_request(params)
-    return data.json()
-
-
-def get_content(title: str) -> str:
-    """Get article text."""
-    params: dict[str, str | int] = {
-        "action": "query",
-        "format": "json",
-        "formatversion": 2,
-        "prop": "revisions|info",
-        "rvprop": "content|timestamp",
-        "titles": title,
-    }
-    data = call(params)
-    rev: str = data["query"]["pages"][0]["revisions"][0]["content"]
-    return rev

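A note on the deleted call() helper: its docstring says GET, but wikidata_oauth.api_post_request issues a POST, and its local variable is annotated dict[str, Any] even though api_post_request returns a requests Response; only the .json() result is a dict. A type-correct sketch of the same helper, assuming the requests library:

    from typing import Any

    import requests

    from dab_mechanic import wikidata_oauth

    def call(params: dict[str, str | int]) -> dict[str, Any]:
        """POST to the mediawiki API via OAuth and decode the JSON body."""
        # api_post_request returns a requests.Response, not a dict
        response: requests.Response = wikidata_oauth.api_post_request(params)
        data: dict[str, Any] = response.json()
        return data
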
dab_mechanic/wikidata_oauth.py

@@ -19,6 +19,7 @@ def get_edit_proxy() -> dict[str, str]:
 def api_post_request(params: dict[str, str | int]):
     """HTTP Post using Oauth."""
     app = current_app
+    url = "https://www.wikidata.org/w/api.php"
     client_key = app.config["CLIENT_KEY"]
     client_secret = app.config["CLIENT_SECRET"]
     oauth = OAuth1Session(
@@ -28,12 +29,12 @@ def api_post_request(params: dict[str, str | int]):
         resource_owner_secret=session["owner_secret"],
     )
     proxies = get_edit_proxy()
-    return oauth.post(api_url, data=params, timeout=10, proxies=proxies)
+    return oauth.post(url, data=params, timeout=4, proxies=proxies)
 
 
 def raw_request(params):
     app = current_app
-    url = api_url + "?" + urlencode(params)
+    url = "https://www.wikidata.org/w/api.php?" + urlencode(params)
     client_key = app.config["CLIENT_KEY"]
     client_secret = app.config["CLIENT_SECRET"]
     oauth = OAuth1Session(
@@ -43,7 +44,7 @@ def raw_request(params):
         resource_owner_secret=session["owner_secret"],
     )
     proxies = get_edit_proxy()
-    return oauth.get(url, timeout=10, proxies=proxies)
+    return oauth.get(url, timeout=4, proxies=proxies)
 
 
 def api_request(params):

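The first hunk header above references get_edit_proxy() -> dict[str, str], whose body is outside this diff. A hypothetical sketch of such a helper, returning the proxies mapping that oauth.post and oauth.get expect (the EDIT_PROXY config key is an assumption, not taken from this diff):

    from flask import current_app

    def get_edit_proxy() -> dict[str, str]:
        """Return a requests-style proxies mapping, empty when no proxy is set."""
        proxy = current_app.config.get("EDIT_PROXY")  # assumed config key
        return {"http": proxy, "https": proxy} if proxy else {}
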
dab_mechanic/wikipedia.py (file deleted, 206 lines)

@@ -1,206 +0,0 @@
-from collections import defaultdict
-from typing import Any, Iterator, Optional, TypedDict
-
-import flask
-import lxml.html
-
-from . import mediawiki_api
-from pprint import pprint
-from time import sleep
-
-disambig_templates = [
-    "Template:Disambiguation",
-    "Template:Airport disambiguation",
-    "Template:Biology disambiguation",
-    "Template:Call sign disambiguation",
-    "Template:Caselaw disambiguation",
-    "Template:Chinese title disambiguation",
-    "Template:Disambiguation cleanup",
-    "Template:Genus disambiguation",
-    "Template:Hospital disambiguation",
-    "Template:Human name disambiguation",
-    "Template:Human name disambiguation cleanup",
-    "Template:Letter-number combination disambiguation",
-    "Template:Mathematical disambiguation",
-    "Template:Military unit disambiguation",
-    "Template:Music disambiguation",
-    "Template:Number disambiguation",
-    "Template:Opus number disambiguation",
-    "Template:Phonetics disambiguation",
-    "Template:Place name disambiguation",
-    "Template:Portal disambiguation",
-    "Template:Road disambiguation",
-    "Template:School disambiguation",
-    "Template:Species Latin name abbreviation disambiguation",
-    "Template:Species Latin name disambiguation",
-    "Template:Station disambiguation",
-    "Template:Synagogue disambiguation",
-    "Template:Taxonomic authority disambiguation",
-    "Template:Taxonomy disambiguation",
-    "Template:Template disambiguation",
-    "Template:WoO number disambiguation",
-]
-
-
-def link_params(enwiki: str) -> dict[str, str | int]:
-    """Parameters for finding article links from the API."""
-    params: dict[str, str | int] = {
-        "action": "query",
-        "format": "json",
-        "formatversion": 2,
-        "titles": enwiki,
-        "generator": "links",
-        "gpllimit": "max",
-        "gplnamespace": 0,
-        "tllimit": "max",
-        "redirects": 1,
-        "tlnamespace": 10,
-        "tltemplates": "|".join(disambig_templates),
-        "prop": "templates",
-    }
-    return params
-
-
-def needs_disambig(link: dict[str, Any]) -> bool:
-    """Is this a disambiguation link."""
-    return bool(
-        not link["title"].endswith(" (disambiguation)") and link.get("templates")
-    )
-
-
-def get_article_links(enwiki: str) -> list[str]:
-    """Get links that appear in this article."""
-
-    params: dict[str, str | int] = link_params(enwiki)
-    links: set[str] = set()
-
-    redirects = defaultdict(set)
-
-    while True:
-        data = mediawiki_api.call(params)
-        if "query" not in data:
-            pprint(data)
-        pages = data["query"].pop("pages")
-        for r in data["query"].pop("redirects"):
-            redirects[r["to"]].add(r["from"])
-
-        links.update(page["title"] for page in pages if needs_disambig(page))
-
-        if "continue" not in data:
-            break
-
-        params["gplcontinue"] = data["continue"]["gplcontinue"]
-        sleep(0.1)
-
-    for link in set(links):
-        if link in redirects:
-            links.update(redirects[link])
-
-    return list(links)
-
-    # return {link["title"] for link in r.json()["query"]["pages"][0]["links"]}
-
-
-def get_article_html(enwiki: str) -> str:
-    """Parse article wikitext and return HTML."""
-    text: str = mediawiki_api.parse_page(enwiki)["text"]
-    return text
-
-
-class DabItem(TypedDict):
-    """Represent a disambiguation page."""
-
-    num: int
-    title: str
-    html: str
-
-
-def delete_toc(root: lxml.html.HtmlElement) -> None:
-    """Delete table of contents from article HTML."""
-    for toc in root.findall(".//div[@class='toc']"):
-        toc.getparent().remove(toc)
-
-
-def get_dab_html(dab_num: int, title: str) -> str:
-    """Parse dab page and rewrite links."""
-    dab_html = get_article_html(title)
-    root = lxml.html.fromstring(dab_html)
-    delete_toc(root)
-
-    element_id_map = {e.get("id"): e for e in root.findall(".//*[@id]")}
-
-    for a in root.findall(".//a[@href]"):
-        href: str | None = a.get("href")
-        if not href:
-            continue
-        if not href.startswith("#"):
-            a.set("href", "#")
-            a.set("onclick", f"return select_dab(this, {dab_num})")
-            continue
-
-        destination_element = element_id_map[href[1:]]
-        assert destination_element is not None
-        destination_element.set("id", f"{dab_num}{href[1:]}")
-        a.set("href", f"#{dab_num}{href[1:]}")
-
-    html: str = lxml.html.tostring(root, encoding=str)
-    return html
-
-
-class Article:
-    """Current article we're working on."""
-
-    def __init__(self, enwiki: str) -> None:
-        """Make a new Article object."""
-        self.enwiki = enwiki.replace("_", " ")
-
-        self.links = get_article_links(enwiki)
-
-        self.dab_list: list[DabItem] = []
-        self.dab_lookup: dict[int, str] = {}
-        self.dab_order: list[str] = []
-        self.parse: Optional[dict[str, Any]] = None
-
-    def save_endpoint(self) -> str:
-        """Endpoint for saving changes."""
-        href: str = flask.url_for("save", enwiki=self.enwiki.replace(" ", "_"))
-        return href
-
-    def load(self) -> None:
-        """Load parsed article HTML."""
-        self.parse = mediawiki_api.parse_page(self.enwiki)
-        self.root = lxml.html.fromstring(self.parse.pop("text"))
-
-    def iter_links(self) -> Iterator[tuple[lxml.html.Element, str]]:
-        """Disambiguation links that need fixing."""
-        seen = set()
-        for a in self.root.findall(".//a[@href]"):
-            title = a.get("title")
-            if title is None or title not in self.links:
-                continue
-            a.set("class", "disambig")
-
-            if title in seen:
-                continue
-            seen.add(title)
-
-            yield a, title
-
-    def process_links(self) -> None:
-        """Process links in parsed wikitext."""
-        for dab_num, (a, title) in enumerate(self.iter_links()):
-            a.set("id", f"dab-{dab_num}")
-
-            dab: DabItem = {
-                "num": dab_num,
-                "title": title,
-                "html": get_dab_html(dab_num, title),
-            }
-            self.dab_list.append(dab)
-            self.dab_order.append(title)
-            self.dab_lookup[dab_num] = title
-
-    def get_html(self) -> str:
-        """Return the processed article HTML."""
-        html: str = lxml.html.tostring(self.root, encoding=str)
-        return html

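For orientation, the deleted Article class is driven by the article_page view shown in the web_view.py hunks below; a minimal usage sketch of that flow (the title is an example taken from article_list):

    from dab_mechanic.wikipedia import Article

    article = Article("Chicago Bulls")
    article.load()             # fetch parsed HTML via mediawiki_api.parse_page
    article.process_links()    # tag dab links, filling dab_list and dab_lookup
    html = article.get_html()  # article HTML with dab links rewritten
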
Deleted stylesheet (78 lines)

@@ -1,78 +0,0 @@
-div.debugger { text-align: left; padding: 12px; margin: auto;
-               background-color: white; }
-div.detail { cursor: pointer; }
-div.detail p { margin: 0 0 8px 13px; font-size: 14px; white-space: pre-wrap;
-               font-family: monospace; }
-div.explanation { margin: 20px 13px; font-size: 15px; color: #555; }
-div.footer { font-size: 13px; text-align: right; margin: 30px 0;
-             color: #86989B; }
-
-h2 { font-size: 16px; margin: 1.3em 0 0.0 0; padding: 9px;
-     background-color: #11557C; color: white; }
-h2 em, h3 em { font-style: normal; color: #A5D6D9; font-weight: normal; }
-
-div.traceback, div.plain { border: 1px solid #ddd; margin: 0 0 1em 0; padding: 10px; }
-div.plain p { margin: 0; }
-div.plain textarea,
-div.plain pre { margin: 10px 0 0 0; padding: 4px;
-                background-color: #E8EFF0; border: 1px solid #D3E7E9; }
-div.plain textarea { width: 99%; height: 300px; }
-div.traceback h3 { font-size: 1em; margin: 0 0 0.8em 0; }
-div.traceback ul { list-style: none; margin: 0; padding: 0 0 0 1em; }
-div.traceback h4 { font-size: 13px; font-weight: normal; margin: 0.7em 0 0.1em 0; }
-div.traceback pre { margin: 0; padding: 5px 0 3px 15px;
-                    background-color: #E8EFF0; border: 1px solid #D3E7E9; }
-div.traceback .library .current { background: white; color: #555; }
-div.traceback .expanded .current { background: #E8EFF0; color: black; }
-div.traceback pre:hover { background-color: #DDECEE; color: black; cursor: pointer; }
-div.traceback div.source.expanded pre + pre { border-top: none; }
-
-div.traceback span.ws { display: none; }
-div.traceback pre.before, div.traceback pre.after { display: none; background: white; }
-div.traceback div.source.expanded pre.before,
-div.traceback div.source.expanded pre.after {
-    display: block;
-}
-
-div.traceback div.source.expanded span.ws {
-    display: inline;
-}
-
-div.traceback blockquote { margin: 1em 0 0 0; padding: 0; white-space: pre-line; }
-div.traceback img { float: right; padding: 2px; margin: -3px 2px 0 0; display: none; }
-div.traceback img:hover { background-color: #ddd; cursor: pointer;
-                          border-color: #BFDDE0; }
-div.traceback pre:hover img { display: block; }
-div.traceback cite.filename { font-style: normal; color: #3B666B; }
-
-pre.console { border: 1px solid #ccc; background: white!important;
-              color: black; padding: 5px!important;
-              margin: 3px 0 0 0!important; cursor: default!important;
-              max-height: 400px; overflow: auto; }
-pre.console form { color: #555; }
-pre.console input { background-color: transparent; color: #555;
-                    width: 90%; font-family: 'Consolas', 'Deja Vu Sans Mono',
-                    'Bitstream Vera Sans Mono', monospace; font-size: 14px;
-                    border: none!important; }
-
-span.string { color: #30799B; }
-span.number { color: #9C1A1C; }
-span.help { color: #3A7734; }
-span.object { color: #485F6E; }
-span.extended { opacity: 0.5; }
-span.extended:hover { opacity: 1; }
-a.toggle { text-decoration: none; background-repeat: no-repeat;
-           background-position: center center;
-           background-image: url(?__debugger__=yes&cmd=resource&f=more.png); }
-a.toggle:hover { background-color: #444; }
-a.open { background-image: url(?__debugger__=yes&cmd=resource&f=less.png); }
-
-div.traceback pre, div.console pre {
-    white-space: pre-wrap;      /* css-3 should we be so lucky... */
-    white-space: -moz-pre-wrap; /* Mozilla, since 1999 */
-    white-space: -pre-wrap;     /* Opera 4-6 ?? */
-    white-space: -o-pre-wrap;   /* Opera 7 ?? */
-    word-wrap: break-word;      /* Internet Explorer 5.5+ */
-    _white-space: pre;          /* IE only hack to re-specify in
-                                   addition to word-wrap */
-}

templates/index.html

@@ -1,14 +1,12 @@
 {% extends "base.html" %}
 
 {% block content %}
-<div class="m-3">
-  <ol>
+<ul>
 {% for enwiki, count in articles %}
   <li>
     <a href="{{ url_for("article_page", enwiki=enwiki) }}">{{ enwiki }}</a>
     ({{ count }} links)
   </li>
 {% endfor %}
-  </ol>
-</div>
+</ul>
 {% endblock %}

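The template iterates over (enwiki, count) pairs, matching the articles list that index() passes to flask.render_template; an illustrative call with made-up counts:

    # inside a Flask view such as index()
    articles = [("Chicago Bulls", 3), ("Urban Hymns", 2)]  # hypothetical data
    return flask.render_template("index.html", articles=articles)
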
web_view.py (252 changed lines)

@@ -3,7 +3,7 @@
 import inspect
 import json
 import re
-from typing import Optional
+from typing import Any, Iterator, Optional, TypedDict
 
 import flask
 import lxml.html
@@ -13,17 +13,16 @@ from requests_oauthlib import OAuth1Session
 from werkzeug.debug.tbtools import get_current_traceback
 from werkzeug.wrappers import Response
 
-from dab_mechanic import mediawiki_api, wikidata_oauth, wikipedia
+from dab_mechanic import wikidata_oauth
 
 app = flask.Flask(__name__)
 app.config.from_object("config.default")
 app.debug = True
 
 wiki_hostname = "en.wikipedia.org"
 wiki_api_php = f"https://{wiki_hostname}/w/api.php"
 wiki_index_php = f"https://{wiki_hostname}/w/index.php"
 
-awdl_url = "https://dplbot.toolforge.org/articles_with_dab_links.php"
 
 @app.before_request
 def global_user():
@@ -47,6 +46,21 @@ def exception_handler(e):
     )
 
 
+def get_content(title: str) -> str:
+    """Get article text."""
+    params: dict[str, str | int] = {
+        "action": "query",
+        "format": "json",
+        "formatversion": 2,
+        "prop": "revisions|info",
+        "rvprop": "content|timestamp",
+        "titles": title,
+    }
+    data = requests.get(wiki_api_php, params=params).json()
+    rev: str = data["query"]["pages"][0]["revisions"][0]["content"]
+    return rev
+
+
 def parse_articles_with_dab_links(root: lxml.html.Element) -> list[tuple[str, int]]:
     """Parse Articles With Multiple Dablinks."""
     articles = []
@@ -64,7 +78,8 @@ def parse_articles_with_dab_links(root: lxml.html.Element) -> list[tuple[str, int]]:
 
 @app.route("/")
 def index():
-    r = requests.get(awdl_url, params={"limit": 100})
+
+    r = requests.get("https://dplbot.toolforge.org/articles_with_dab_links.php")
     root = lxml.html.fromstring(r.content)
     articles = parse_articles_with_dab_links(root)
 
@@ -73,6 +88,145 @@ def index():
     return flask.render_template("index.html", articles=articles)
 
 
+def call_parse_api(enwiki: str) -> dict[str, Any]:
+    """Call mediawiki parse API for given article."""
+    url = "https://en.wikipedia.org/w/api.php"
+
+    params: dict[str, str | int] = {
+        "action": "parse",
+        "format": "json",
+        "formatversion": 2,
+        "disableeditsection": 1,
+        "page": enwiki,
+        "prop": "text|links|headhtml",
+        "disabletoc": 1,
+    }
+
+    r = requests.get(url, params=params)
+    parse: dict[str, Any] = r.json()["parse"]
+    return parse
+
+
+def get_article_html(enwiki: str) -> str:
+    """Parse article wikitext and return HTML."""
+    text: str = call_parse_api(enwiki)["text"]
+    return text
+
+
+disambig_templates = [
+    "Template:Disambiguation",
+    "Template:Airport disambiguation",
+    "Template:Biology disambiguation",
+    "Template:Call sign disambiguation",
+    "Template:Caselaw disambiguation",
+    "Template:Chinese title disambiguation",
+    "Template:Disambiguation cleanup",
+    "Template:Genus disambiguation",
+    "Template:Hospital disambiguation",
+    "Template:Human name disambiguation",
+    "Template:Human name disambiguation cleanup",
+    "Template:Letter-number combination disambiguation",
+    "Template:Mathematical disambiguation",
+    "Template:Military unit disambiguation",
+    "Template:Music disambiguation",
+    "Template:Number disambiguation",
+    "Template:Opus number disambiguation",
+    "Template:Phonetics disambiguation",
+    "Template:Place name disambiguation",
+    "Template:Portal disambiguation",
+    "Template:Road disambiguation",
+    "Template:School disambiguation",
+    "Template:Species Latin name abbreviation disambiguation",
+    "Template:Species Latin name disambiguation",
+    "Template:Station disambiguation",
+    "Template:Synagogue disambiguation",
+    "Template:Taxonomic authority disambiguation",
+    "Template:Taxonomy disambiguation",
+    "Template:Template disambiguation",
+    "Template:WoO number disambiguation",
+]
+
+
+def link_params(enwiki: str) -> dict[str, str | int]:
+    """Parameters for finding article links from the API."""
+    params: dict[str, str | int] = {
+        "action": "query",
+        "format": "json",
+        "formatversion": 2,
+        "titles": enwiki,
+        "generator": "links",
+        "gpllimit": "max",
+        "gplnamespace": 0,
+        "tllimit": "max",
+        "tlnamespace": 10,
+        "tltemplates": "|".join(disambig_templates),
+        "prop": "templates",
+    }
+    return params
+
+
+def needs_disambig(link: dict[str, Any]) -> bool:
+    """Is this a disambiguation link."""
+    return bool(
+        not link["title"].endswith(" (disambiguation)") and link.get("templates")
+    )
+
+
+def get_article_links(enwiki: str) -> list[str]:
+    """Get links that appear in this article."""
+    url = "https://en.wikipedia.org/w/api.php"
+
+    params: dict[str, str | int] = link_params(enwiki)
+    links: set[str] = set()
+
+    while True:
+        data = requests.get(url, params=params).json()
+        links.update(
+            page["title"] for page in data["query"]["pages"] if needs_disambig(page)
+        )
+
+        if "continue" not in data:
+            break
+
+        params["gplcontinue"] = data["continue"]["gplcontinue"]
+
+    return list(links)
+
+    # return {link["title"] for link in r.json()["query"]["pages"][0]["links"]}
+
+
+def delete_toc(root: lxml.html.HtmlElement) -> None:
+    """Delete table of contents from article HTML."""
+    for toc in root.findall(".//div[@class='toc']"):
+        toc.getparent().remove(toc)
+
+
+def get_dab_html(dab_num: int, title: str) -> str:
+    """Parse dab page and rewrite links."""
+    dab_html = get_article_html(title)
+    root = lxml.html.fromstring(dab_html)
+    delete_toc(root)
+
+    element_id_map = {e.get("id"): e for e in root.findall(".//*[@id]")}
+
+    for a in root.findall(".//a[@href]"):
+        href: str | None = a.get("href")
+        if not href:
+            continue
+        if not href.startswith("#"):
+            a.set("href", "#")
+            a.set("onclick", f"return select_dab(this, {dab_num})")
+            continue
+
+        destination_element = element_id_map[href[1:]]
+        assert destination_element is not None
+        destination_element.set("id", f"{dab_num}{href[1:]}")
+        a.set("href", f"#{dab_num}{href[1:]}")
+
+    html: str = lxml.html.tostring(root, encoding=str)
+    return html
+
+
 def make_disamb_link(edit: tuple[str, str]) -> str:
     """Given an edit return the appropriate link."""
     return f"[[{edit[1]}|{edit[0]}]]"

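make_disamb_link, kept as context above, turns an (old text, new target) edit pair into a piped wikilink; for example, with a hypothetical pair:

    make_disamb_link(("Mercury", "Mercury (planet)"))
    # -> '[[Mercury (planet)|Mercury]]'
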
@@ -112,7 +266,7 @@ def save(enwiki: str) -> Response | str:
 
     edit_summary = f"Disambiguate {titles} using [[User:Edward/Dab mechanic]]"
 
-    article_text = apply_edits(mediawiki_api.get_content(enwiki), edits)
+    article_text = apply_edits(get_content(enwiki), edits)
 
     return flask.render_template(
         "save.html",
@@ -123,25 +277,85 @@ def save(enwiki: str) -> Response | str:
     )
 
 
-def redirect_if_needed(enwiki: str) -> Optional[Response]:
-    """Check if there are spaces in the article name and redirect."""
-    return (
-        flask.redirect(
-            flask.url_for(flask.request.endpoint, enwiki=enwiki.replace(" ", "_"))
-        )
-        if " " in enwiki
-        else None
-    )
+class DabItem(TypedDict):
+    """Represent a disambiguation page."""
+
+    num: int
+    title: str
+    html: str
+
+
+class Article:
+    """Current article we're working on."""
+
+    def __init__(self, enwiki: str) -> None:
+        """Make a new Article object."""
+        self.enwiki = enwiki
+
+        self.links = get_article_links(enwiki)
+
+        self.dab_list: list[DabItem] = []
+        self.dab_lookup: dict[int, str] = {}
+        self.dab_order: list[str] = []
+        self.parse: Optional[dict[str, Any]] = None
+
+    def save_endpoint(self) -> str:
+        """Endpoint for saving changes."""
+        href: str = flask.url_for("save", enwiki=self.enwiki.replace(" ", "_"))
+        return href
+
+    def load(self) -> None:
+        """Load parsed article HTML."""
+        self.parse = call_parse_api(self.enwiki)
+        self.root = lxml.html.fromstring(self.parse.pop("text"))
+
+    def iter_links(self) -> Iterator[tuple[lxml.html.Element, str]]:
+        """Disambiguation links that need fixing."""
+        seen = set()
+        for a in self.root.findall(".//a[@href]"):
+            title = a.get("title")
+            if title is None or title not in self.links:
+                continue
+            a.set("class", "disambig")
+
+            if title in seen:
+                continue
+            seen.add(title)
+
+            yield a, title
+
+    def process_links(self) -> None:
+        """Process links in parsed wikitext."""
+        for dab_num, (a, title) in enumerate(self.iter_links()):
+            a.set("id", f"dab-{dab_num}")
+
+            dab: DabItem = {
+                "num": dab_num,
+                "title": title,
+                "html": get_dab_html(dab_num, title),
+            }
+            self.dab_list.append(dab)
+            self.dab_order.append(title)
+            self.dab_lookup[dab_num] = title
+
+    def get_html(self) -> str:
+        """Return the processed article HTML."""
+        html: str = lxml.html.tostring(self.root, encoding=str)
+        return html
+
+
 @app.route("/enwiki/<path:enwiki>")
 def article_page(enwiki: str) -> Response:
     """Article Page."""
-    redirect = redirect_if_needed(enwiki)
-    if redirect:
-        return redirect
+    enwiki_orig = enwiki
+    enwiki = enwiki.replace("_", " ")
+    enwiki_underscore = enwiki.replace(" ", "_")
+    if " " in enwiki_orig:
+        return flask.redirect(
+            flask.url_for(flask.request.endpoint, enwiki=enwiki_underscore)
+        )
 
-    article = wikipedia.Article(enwiki)
+    article = Article(enwiki)
     article.load()
     article.process_links()
 