Compare commits

...

10 commits

Author SHA1 Message Date
Edward Betts 4d175c8733 Turn off app.debug so error pages works. 2022-08-17 20:06:36 +01:00
Edward Betts e85cefbc2f Make mediawiki API calls via OAuth
The API had a timeout problem. Maybe this fixes it.
2022-08-17 20:04:43 +01:00
Edward Betts b1f402e1f9 refactor 2022-08-17 14:48:20 +01:00
Edward Betts 5f8900a47a Add User-Agent to mediawiki API calls 2022-08-17 14:38:30 +01:00
Edward Betts 4e1ad4efbc add gitignore 2022-08-17 13:58:17 +01:00
Edward Betts c2b3d22e45 Add CSS for error page 2022-08-17 13:54:16 +01:00
Edward Betts 8268822181 Remove unused article list 2022-08-17 13:46:00 +01:00
Edward Betts 78de5cc139 Split code into another file 2022-08-17 13:35:26 +01:00
Edward Betts 8432632aae Show more articles on index page 2022-08-17 13:34:55 +01:00
Edward Betts 5ff4749512 Formatting for article list 2022-08-17 13:34:17 +01:00
8 changed files with 356 additions and 289 deletions

1
.gitignore vendored Normal file
View file

@ -0,0 +1 @@
__pycache__

View file

@ -1,50 +0,0 @@
Rail transport in Indonesia
SchleFaZ
Chicago Bulls
Orwell Prize
List of fatal victims of the September 11 attacks
Arabic exonyms
Canadian Alpine Ski Championships
Method Man filmography
Popular Union
The Cantos
Unisex name
United States Alpine Ski Championships
AS Kaloum Star
Akademi Fantasia (season 1)
Athletics at the 2022 Bolivarian Games
I Love the 2000s
Kununokuni
List of Wisin & Yandel collaborations
List of comics based on films
List of programs broadcast by Asianet
Urban Hymns
1979 Sydney City FC season
2007 in Spanish television
2022 World Athletics U20 Championships Men's 4 × 100 metres relay
A2 autostrada (Poland)
Black to the Future (TV series)
Chandel (Rajput clan)
County of Isenburg
Dinka people
Dwayne McDuffie Award for Diversity in Comics
FTSE Italia Mid Cap
Globoplay
Index of Armenia-related articles
List of Equinox episodes
List of Indian monarchs
List of Italian exonyms in Dalmatia
List of Ultimate Marvel characters
List of cities with historical German exonyms
List of jötnar in Norse mythology
List of language families
List of people with surname Davis
List of political parties in Venezuela
List of programmes broadcast by HTV
Paul (given name)
Principality of Lippe
Propaganda in Russia
Qazi Ghulam Mustafa
Redfern Now
Roy Orbison/The Beatles Tour
Royal Birmingham Conservatoire

View file

@ -0,0 +1,45 @@
"""Interface with the mediawiki API."""
from typing import Any
from . import wikidata_oauth
# Hostname of the wiki this module is written against.
wiki_hostname = "en.wikipedia.org"
# URL of that wiki's api.php endpoint, derived from the hostname above.
wiki_api_php = f"https://{wiki_hostname}/w/api.php"
# NOTE(review): none of the functions visible in this module reference
# `wiki_api_php` or `user_agent`; requests are delegated to wikidata_oauth.
# Confirm the User-Agent header is still being sent somewhere.
user_agent = "dab-mechanic/0.1"
def parse_page(enwiki: str) -> dict[str, Any]:
    """Fetch the parsed form of an article from the mediawiki parse API.

    Requests the rendered text, links and head HTML for the page named
    *enwiki*, with section edit links and the table of contents disabled,
    and returns the ``parse`` member of the JSON reply.

    Raises ``KeyError`` when the reply has no ``parse`` member.
    """
    reply = call(
        {
            "action": "parse",
            "format": "json",
            "formatversion": 2,
            "disableeditsection": 1,
            "page": enwiki,
            "prop": "text|links|headhtml",
            "disabletoc": 1,
        }
    )
    result: dict[str, Any] = reply["parse"]
    return result
def call(params: dict[str, str | int]) -> dict[str, Any]:
    """Send a request to the mediawiki API and return the decoded JSON reply.

    The request is delegated to ``wikidata_oauth.api_post_request`` (an
    OAuth-signed HTTP POST), so this is not a GET request. The ``json()``
    method of the returned response object is used to decode the body.
    """
    # api_post_request returns a response object, not a dict; only the
    # decoded body has the dict[str, Any] shape.
    response = wikidata_oauth.api_post_request(params)
    data: dict[str, Any] = response.json()
    return data
def get_content(title: str) -> str:
    """Return the wikitext of an article.

    Queries the ``revisions`` and ``info`` props for *title* and returns the
    ``content`` of the first revision of the first page in the reply.
    Raises ``KeyError``/``IndexError`` if the reply does not have that shape.
    """
    reply = call(
        {
            "action": "query",
            "format": "json",
            "formatversion": 2,
            "prop": "revisions|info",
            "rvprop": "content|timestamp",
            "titles": title,
        }
    )
    first_page = reply["query"]["pages"][0]
    first_revision = first_page["revisions"][0]
    content: str = first_revision["content"]
    return content

View file

@ -19,7 +19,6 @@ def get_edit_proxy() -> dict[str, str]:
def api_post_request(params: dict[str, str | int]): def api_post_request(params: dict[str, str | int]):
"""HTTP Post using Oauth.""" """HTTP Post using Oauth."""
app = current_app app = current_app
url = "https://www.wikidata.org/w/api.php"
client_key = app.config["CLIENT_KEY"] client_key = app.config["CLIENT_KEY"]
client_secret = app.config["CLIENT_SECRET"] client_secret = app.config["CLIENT_SECRET"]
oauth = OAuth1Session( oauth = OAuth1Session(
@ -29,12 +28,12 @@ def api_post_request(params: dict[str, str | int]):
resource_owner_secret=session["owner_secret"], resource_owner_secret=session["owner_secret"],
) )
proxies = get_edit_proxy() proxies = get_edit_proxy()
return oauth.post(url, data=params, timeout=4, proxies=proxies) return oauth.post(api_url, data=params, timeout=10, proxies=proxies)
def raw_request(params): def raw_request(params):
app = current_app app = current_app
url = "https://www.wikidata.org/w/api.php?" + urlencode(params) url = api_url + "?" + urlencode(params)
client_key = app.config["CLIENT_KEY"] client_key = app.config["CLIENT_KEY"]
client_secret = app.config["CLIENT_SECRET"] client_secret = app.config["CLIENT_SECRET"]
oauth = OAuth1Session( oauth = OAuth1Session(
@ -44,7 +43,7 @@ def raw_request(params):
resource_owner_secret=session["owner_secret"], resource_owner_secret=session["owner_secret"],
) )
proxies = get_edit_proxy() proxies = get_edit_proxy()
return oauth.get(url, timeout=4, proxies=proxies) return oauth.get(url, timeout=10, proxies=proxies)
def api_request(params): def api_request(params):

206
dab_mechanic/wikipedia.py Normal file
View file

@ -0,0 +1,206 @@
from collections import defaultdict
from typing import Any, Iterator, Optional, TypedDict
import flask
import lxml.html
from . import mediawiki_api
from pprint import pprint
from time import sleep
# Names (without the "Template:" namespace prefix) of the templates that
# link_params() sends to the API as the `tltemplates` filter.
_DISAMBIG_TEMPLATE_NAMES = (
    "Disambiguation",
    "Airport disambiguation",
    "Biology disambiguation",
    "Call sign disambiguation",
    "Caselaw disambiguation",
    "Chinese title disambiguation",
    "Disambiguation cleanup",
    "Genus disambiguation",
    "Hospital disambiguation",
    "Human name disambiguation",
    "Human name disambiguation cleanup",
    "Letter-number combination disambiguation",
    "Mathematical disambiguation",
    "Military unit disambiguation",
    "Music disambiguation",
    "Number disambiguation",
    "Opus number disambiguation",
    "Phonetics disambiguation",
    "Place name disambiguation",
    "Portal disambiguation",
    "Road disambiguation",
    "School disambiguation",
    "Species Latin name abbreviation disambiguation",
    "Species Latin name disambiguation",
    "Station disambiguation",
    "Synagogue disambiguation",
    "Taxonomic authority disambiguation",
    "Taxonomy disambiguation",
    "Template disambiguation",
    "WoO number disambiguation",
)

# Fully qualified template titles, in the same order as the names above.
disambig_templates = [f"Template:{name}" for name in _DISAMBIG_TEMPLATE_NAMES]
def link_params(enwiki: str) -> dict[str, str | int]:
    """Build the API query that lists an article's links.

    The query uses the ``links`` generator on *enwiki*, limited to
    namespace 0, and asks for each linked page's templates filtered to the
    titles in ``disambig_templates``. A fresh dict is returned on each call.
    """
    template_filter = "|".join(disambig_templates)
    return {
        "action": "query",
        "format": "json",
        "formatversion": 2,
        "titles": enwiki,
        "generator": "links",
        "gpllimit": "max",
        "gplnamespace": 0,
        "tllimit": "max",
        "redirects": 1,
        "tlnamespace": 10,
        "tltemplates": template_filter,
        "prop": "templates",
    }
def needs_disambig(link: dict[str, Any]) -> bool:
    """Tell whether a linked page should be offered for disambiguation.

    A page qualifies when its ``templates`` entry is present and non-empty
    and its title does not end in " (disambiguation)".
    """
    if link["title"].endswith(" (disambiguation)"):
        return False
    return bool(link.get("templates"))
def get_article_links(enwiki: str) -> list[str]:
    """Return the titles of disambiguation pages linked from an article.

    Pages through the API query built by ``link_params`` and collects every
    linked page accepted by ``needs_disambig``. For each collected title, the
    titles of redirects the API reported as resolving to it are included as
    well. The order of the returned list is unspecified.

    Raises ``RuntimeError`` if the API reply carries an ``error`` member.
    """
    base_params = link_params(enwiki)
    request: dict[str, str | int] = dict(base_params)
    links: set[str] = set()
    redirects: defaultdict[str, set[str]] = defaultdict(set)

    while True:
        data = mediawiki_api.call(request)
        if "error" in data:
            raise RuntimeError(f"mediawiki API error: {data['error']}")

        # "query", "redirects" and "pages" may each be missing from a batch
        # (for example a batch in which no redirect was resolved).
        query = data.get("query", {})
        for redirect in query.get("redirects", []):
            redirects[redirect["to"]].add(redirect["from"])
        links.update(
            page["title"] for page in query.get("pages", []) if needs_disambig(page)
        )

        if "continue" not in data:
            break
        # Send back every continuation value, not only gplcontinue: the
        # templates prop can ask to continue with tlcontinue instead.
        # Rebuilding from the base parameters drops stale values.
        request = {**base_params, **data["continue"]}
        sleep(0.1)

    # Iterate over a snapshot because the set grows inside the loop.
    for link in list(links):
        links.update(redirects.get(link, ()))

    return list(links)
def get_article_html(enwiki: str) -> str:
    """Return the ``text`` member of the parse API reply for *enwiki*."""
    parsed = mediawiki_api.parse_page(enwiki)
    html: str = parsed["text"]
    return html
class DabItem(TypedDict):
    """One disambiguation page linked from the article being worked on."""

    # Index assigned to this page by Article.process_links().
    num: int
    # Title of the disambiguation page.
    title: str
    # HTML produced for this page by get_dab_html().
    html: str
def delete_toc(root: lxml.html.HtmlElement) -> None:
    """Remove each ``div`` under *root* whose class attribute is exactly "toc".

    The tree is modified in place.
    """
    toc_divs = root.findall(".//div[@class='toc']")
    for div in toc_divs:
        parent = div.getparent()
        parent.remove(div)
def get_dab_html(dab_num: int, title: str) -> str:
    """Return the HTML of a disambiguation page with its links rewritten.

    The page named *title* is parsed, its table of contents is removed, and
    every ``<a href>`` is rewritten:

    * Links that are not in-page fragments get ``href="#"`` and an
      ``onclick`` handler calling ``select_dab(this, dab_num)``.
    * Fragment links, and the element they point at, have *dab_num*
      prepended to the id so ids differ between pages given different
      numbers.

    A fragment link with no matching element id is left unchanged.
    """
    dab_html = get_article_html(title)
    root = lxml.html.fromstring(dab_html)
    delete_toc(root)

    element_id_map = {e.get("id"): e for e in root.findall(".//*[@id]")}

    for a in root.findall(".//a[@href]"):
        href: str | None = a.get("href")
        if not href:
            continue
        if not href.startswith("#"):
            a.set("href", "#")
            a.set("onclick", f"return select_dab(this, {dab_num})")
            continue

        fragment = href[1:]
        # A bare "#" or a fragment with no matching id must not abort the
        # whole page, so look the target up without raising.
        destination_element = element_id_map.get(fragment)
        if destination_element is None:
            continue

        # The map still holds the original id, so several links to the same
        # anchor all resolve and set the same prefixed id.
        destination_element.set("id", f"{dab_num}{fragment}")
        a.set("href", f"#{dab_num}{fragment}")

    html: str = lxml.html.tostring(root, encoding=str)
    return html
class Article:
    """Current article we're working on.

    Construction fetches the article's disambiguation links. ``load()`` must
    be called before ``iter_links()``, ``process_links()`` or ``get_html()``,
    because those operate on the parsed HTML tree stored in ``root``.
    """

    def __init__(self, enwiki: str) -> None:
        """Make a new Article object for the page named *enwiki*.

        Underscores in the stored title are replaced with spaces.
        """
        self.enwiki = enwiki.replace("_", " ")
        self.links = get_article_links(enwiki)

        self.dab_list: list[DabItem] = []
        self.dab_lookup: dict[int, str] = {}
        self.dab_order: list[str] = []
        self.parse: Optional[dict[str, Any]] = None
        # Parsed HTML tree; stays None until load() has been called.
        self.root: Optional[lxml.html.HtmlElement] = None

    def _loaded_root(self) -> lxml.html.HtmlElement:
        """Return the parsed HTML tree, raising RuntimeError if not loaded."""
        if self.root is None:
            raise RuntimeError("Article.load() must be called first")
        return self.root

    def save_endpoint(self) -> str:
        """Return the URL of the ``save`` endpoint for this article."""
        href: str = flask.url_for("save", enwiki=self.enwiki.replace(" ", "_"))
        return href

    def load(self) -> None:
        """Fetch the parsed article and build the HTML tree.

        The ``text`` member is popped out of the stored ``parse`` reply.
        """
        self.parse = mediawiki_api.parse_page(self.enwiki)
        self.root = lxml.html.fromstring(self.parse.pop("text"))

    def iter_links(self) -> Iterator[tuple[lxml.html.HtmlElement, str]]:
        """Yield the first anchor for each disambiguation link in the article.

        Every anchor whose ``title`` attribute is one of ``self.links`` has
        its class set to "disambig"; only the first anchor seen for each
        title is yielded, together with that title.
        """
        root = self._loaded_root()
        # Build the lookup set once instead of scanning the list per anchor.
        link_titles = set(self.links)
        seen: set[str] = set()
        for a in root.findall(".//a[@href]"):
            title = a.get("title")
            if title is None or title not in link_titles:
                continue
            a.set("class", "disambig")

            if title in seen:
                continue
            seen.add(title)

            yield a, title

    def process_links(self) -> None:
        """Number each disambiguation link and fetch its page HTML.

        Each anchor from ``iter_links()`` gets the id ``dab-<n>``, and the
        matching entry is recorded in ``dab_list``, ``dab_order`` and
        ``dab_lookup``.
        """
        for dab_num, (a, title) in enumerate(self.iter_links()):
            a.set("id", f"dab-{dab_num}")

            dab: DabItem = {
                "num": dab_num,
                "title": title,
                "html": get_dab_html(dab_num, title),
            }
            self.dab_list.append(dab)
            self.dab_order.append(title)
            self.dab_lookup[dab_num] = title

    def get_html(self) -> str:
        """Return the processed article HTML serialised as a string."""
        html: str = lxml.html.tostring(self._loaded_root(), encoding=str)
        return html

78
static/css/exception.css Normal file
View file

@ -0,0 +1,78 @@
div.debugger { text-align: left; padding: 12px; margin: auto;
background-color: white; }
div.detail { cursor: pointer; }
div.detail p { margin: 0 0 8px 13px; font-size: 14px; white-space: pre-wrap;
font-family: monospace; }
div.explanation { margin: 20px 13px; font-size: 15px; color: #555; }
div.footer { font-size: 13px; text-align: right; margin: 30px 0;
color: #86989B; }
h2 { font-size: 16px; margin: 1.3em 0 0.0 0; padding: 9px;
background-color: #11557C; color: white; }
h2 em, h3 em { font-style: normal; color: #A5D6D9; font-weight: normal; }
div.traceback, div.plain { border: 1px solid #ddd; margin: 0 0 1em 0; padding: 10px; }
div.plain p { margin: 0; }
div.plain textarea,
div.plain pre { margin: 10px 0 0 0; padding: 4px;
background-color: #E8EFF0; border: 1px solid #D3E7E9; }
div.plain textarea { width: 99%; height: 300px; }
div.traceback h3 { font-size: 1em; margin: 0 0 0.8em 0; }
div.traceback ul { list-style: none; margin: 0; padding: 0 0 0 1em; }
div.traceback h4 { font-size: 13px; font-weight: normal; margin: 0.7em 0 0.1em 0; }
div.traceback pre { margin: 0; padding: 5px 0 3px 15px;
background-color: #E8EFF0; border: 1px solid #D3E7E9; }
div.traceback .library .current { background: white; color: #555; }
div.traceback .expanded .current { background: #E8EFF0; color: black; }
div.traceback pre:hover { background-color: #DDECEE; color: black; cursor: pointer; }
div.traceback div.source.expanded pre + pre { border-top: none; }
div.traceback span.ws { display: none; }
div.traceback pre.before, div.traceback pre.after { display: none; background: white; }
div.traceback div.source.expanded pre.before,
div.traceback div.source.expanded pre.after {
display: block;
}
div.traceback div.source.expanded span.ws {
display: inline;
}
div.traceback blockquote { margin: 1em 0 0 0; padding: 0; white-space: pre-line; }
div.traceback img { float: right; padding: 2px; margin: -3px 2px 0 0; display: none; }
div.traceback img:hover { background-color: #ddd; cursor: pointer;
border-color: #BFDDE0; }
div.traceback pre:hover img { display: block; }
div.traceback cite.filename { font-style: normal; color: #3B666B; }
pre.console { border: 1px solid #ccc; background: white!important;
color: black; padding: 5px!important;
margin: 3px 0 0 0!important; cursor: default!important;
max-height: 400px; overflow: auto; }
pre.console form { color: #555; }
pre.console input { background-color: transparent; color: #555;
width: 90%; font-family: 'Consolas', 'Deja Vu Sans Mono',
'Bitstream Vera Sans Mono', monospace; font-size: 14px;
border: none!important; }
span.string { color: #30799B; }
span.number { color: #9C1A1C; }
span.help { color: #3A7734; }
span.object { color: #485F6E; }
span.extended { opacity: 0.5; }
span.extended:hover { opacity: 1; }
a.toggle { text-decoration: none; background-repeat: no-repeat;
background-position: center center;
background-image: url(?__debugger__=yes&cmd=resource&f=more.png); }
a.toggle:hover { background-color: #444; }
a.open { background-image: url(?__debugger__=yes&cmd=resource&f=less.png); }
div.traceback pre, div.console pre {
white-space: pre-wrap; /* css-3 should we be so lucky... */
white-space: -moz-pre-wrap; /* Mozilla, since 1999 */
white-space: -pre-wrap; /* Opera 4-6 ?? */
white-space: -o-pre-wrap; /* Opera 7 ?? */
word-wrap: break-word; /* Internet Explorer 5.5+ */
_white-space: pre; /* IE only hack to re-specify in
addition to word-wrap */
}

View file

@ -1,12 +1,14 @@
{% extends "base.html" %} {% extends "base.html" %}
{% block content %} {% block content %}
<ul> <div class="m-3">
<ol>
{% for enwiki, count in articles %} {% for enwiki, count in articles %}
<li> <li>
<a href="{{ url_for("article_page", enwiki=enwiki) }}">{{ enwiki }} <a href="{{ url_for("article_page", enwiki=enwiki) }}">{{ enwiki }}
({{ count }} links) ({{ count }} links)
</li> </li>
{% endfor %} {% endfor %}
</ul> </ol>
</div>
{% endblock %} {% endblock %}

View file

@ -3,7 +3,7 @@
import inspect import inspect
import json import json
import re import re
from typing import Any, Iterator, Optional, TypedDict from typing import Optional
import flask import flask
import lxml.html import lxml.html
@ -13,16 +13,17 @@ from requests_oauthlib import OAuth1Session
from werkzeug.debug.tbtools import get_current_traceback from werkzeug.debug.tbtools import get_current_traceback
from werkzeug.wrappers import Response from werkzeug.wrappers import Response
from dab_mechanic import wikidata_oauth from dab_mechanic import mediawiki_api, wikidata_oauth, wikipedia
app = flask.Flask(__name__) app = flask.Flask(__name__)
app.config.from_object("config.default") app.config.from_object("config.default")
app.debug = True
wiki_hostname = "en.wikipedia.org" wiki_hostname = "en.wikipedia.org"
wiki_api_php = f"https://{wiki_hostname}/w/api.php" wiki_api_php = f"https://{wiki_hostname}/w/api.php"
wiki_index_php = f"https://{wiki_hostname}/w/index.php" wiki_index_php = f"https://{wiki_hostname}/w/index.php"
awdl_url = "https://dplbot.toolforge.org/articles_with_dab_links.php"
@app.before_request @app.before_request
def global_user(): def global_user():
@ -46,21 +47,6 @@ def exception_handler(e):
) )
def get_content(title: str) -> str:
"""Get article text."""
params: dict[str, str | int] = {
"action": "query",
"format": "json",
"formatversion": 2,
"prop": "revisions|info",
"rvprop": "content|timestamp",
"titles": title,
}
data = requests.get(wiki_api_php, params=params).json()
rev: str = data["query"]["pages"][0]["revisions"][0]["content"]
return rev
def parse_articles_with_dab_links(root: lxml.html.Element) -> list[tuple[str, int]]: def parse_articles_with_dab_links(root: lxml.html.Element) -> list[tuple[str, int]]:
"""Parse Articles With Multiple Dablinks.""" """Parse Articles With Multiple Dablinks."""
articles = [] articles = []
@ -78,8 +64,7 @@ def parse_articles_with_dab_links(root: lxml.html.Element) -> list[tuple[str, in
@app.route("/") @app.route("/")
def index(): def index():
r = requests.get(awdl_url, params={"limit": 100})
r = requests.get("https://dplbot.toolforge.org/articles_with_dab_links.php")
root = lxml.html.fromstring(r.content) root = lxml.html.fromstring(r.content)
articles = parse_articles_with_dab_links(root) articles = parse_articles_with_dab_links(root)
@ -88,145 +73,6 @@ def index():
return flask.render_template("index.html", articles=articles) return flask.render_template("index.html", articles=articles)
def call_parse_api(enwiki: str) -> dict[str, Any]:
"""Call mediawiki parse API for given article."""
url = "https://en.wikipedia.org/w/api.php"
params: dict[str, str | int] = {
"action": "parse",
"format": "json",
"formatversion": 2,
"disableeditsection": 1,
"page": enwiki,
"prop": "text|links|headhtml",
"disabletoc": 1,
}
r = requests.get(url, params=params)
parse: dict[str, Any] = r.json()["parse"]
return parse
def get_article_html(enwiki: str) -> str:
"""Parse article wikitext and return HTML."""
text: str = call_parse_api(enwiki)["text"]
return text
disambig_templates = [
"Template:Disambiguation",
"Template:Airport disambiguation",
"Template:Biology disambiguation",
"Template:Call sign disambiguation",
"Template:Caselaw disambiguation",
"Template:Chinese title disambiguation",
"Template:Disambiguation cleanup",
"Template:Genus disambiguation",
"Template:Hospital disambiguation",
"Template:Human name disambiguation",
"Template:Human name disambiguation cleanup",
"Template:Letter-number combination disambiguation",
"Template:Mathematical disambiguation",
"Template:Military unit disambiguation",
"Template:Music disambiguation",
"Template:Number disambiguation",
"Template:Opus number disambiguation",
"Template:Phonetics disambiguation",
"Template:Place name disambiguation",
"Template:Portal disambiguation",
"Template:Road disambiguation",
"Template:School disambiguation",
"Template:Species Latin name abbreviation disambiguation",
"Template:Species Latin name disambiguation",
"Template:Station disambiguation",
"Template:Synagogue disambiguation",
"Template:Taxonomic authority disambiguation",
"Template:Taxonomy disambiguation",
"Template:Template disambiguation",
"Template:WoO number disambiguation",
]
def link_params(enwiki: str) -> dict[str, str | int]:
"""Parameters for finding article links from the API."""
params: dict[str, str | int] = {
"action": "query",
"format": "json",
"formatversion": 2,
"titles": enwiki,
"generator": "links",
"gpllimit": "max",
"gplnamespace": 0,
"tllimit": "max",
"tlnamespace": 10,
"tltemplates": "|".join(disambig_templates),
"prop": "templates",
}
return params
def needs_disambig(link: dict[str, Any]) -> bool:
"""Is this a disambiguation link."""
return bool(
not link["title"].endswith(" (disambiguation)") and link.get("templates")
)
def get_article_links(enwiki: str) -> list[str]:
"""Get links that appear in this article."""
url = "https://en.wikipedia.org/w/api.php"
params: dict[str, str | int] = link_params(enwiki)
links: set[str] = set()
while True:
data = requests.get(url, params=params).json()
links.update(
page["title"] for page in data["query"]["pages"] if needs_disambig(page)
)
if "continue" not in data:
break
params["gplcontinue"] = data["continue"]["gplcontinue"]
return list(links)
# return {link["title"] for link in r.json()["query"]["pages"][0]["links"]}
def delete_toc(root: lxml.html.HtmlElement) -> None:
"""Delete table of contents from article HTML."""
for toc in root.findall(".//div[@class='toc']"):
toc.getparent().remove(toc)
def get_dab_html(dab_num: int, title: str) -> str:
"""Parse dab page and rewrite links."""
dab_html = get_article_html(title)
root = lxml.html.fromstring(dab_html)
delete_toc(root)
element_id_map = {e.get("id"): e for e in root.findall(".//*[@id]")}
for a in root.findall(".//a[@href]"):
href: str | None = a.get("href")
if not href:
continue
if not href.startswith("#"):
a.set("href", "#")
a.set("onclick", f"return select_dab(this, {dab_num})")
continue
destination_element = element_id_map[href[1:]]
assert destination_element is not None
destination_element.set("id", f"{dab_num}{href[1:]}")
a.set("href", f"#{dab_num}{href[1:]}")
html: str = lxml.html.tostring(root, encoding=str)
return html
def make_disamb_link(edit: tuple[str, str]) -> str: def make_disamb_link(edit: tuple[str, str]) -> str:
"""Given an edit return the appropriate link.""" """Given an edit return the appropriate link."""
return f"[[{edit[1]}|{edit[0]}]]" return f"[[{edit[1]}|{edit[0]}]]"
@ -266,7 +112,7 @@ def save(enwiki: str) -> Response | str:
edit_summary = f"Disambiguate {titles} using [[User:Edward/Dab mechanic]]" edit_summary = f"Disambiguate {titles} using [[User:Edward/Dab mechanic]]"
article_text = apply_edits(get_content(enwiki), edits) article_text = apply_edits(mediawiki_api.get_content(enwiki), edits)
return flask.render_template( return flask.render_template(
"save.html", "save.html",
@ -277,85 +123,25 @@ def save(enwiki: str) -> Response | str:
) )
class DabItem(TypedDict): def redirect_if_needed(enwiki: str) -> Optional[Response]:
"""Represent a disabiguation page.""" """Check if there are spaces in the article name and redirect."""
return (
num: int flask.redirect(
title: str flask.url_for(flask.request.endpoint, enwiki=enwiki.replace(" ", "_"))
html: str )
if " " in enwiki
else None
class Article: )
"""Current article we're working on."""
def __init__(self, enwiki: str) -> None:
"""Make a new Article object."""
self.enwiki = enwiki
self.links = get_article_links(enwiki)
self.dab_list: list[DabItem] = []
self.dab_lookup: dict[int, str] = {}
self.dab_order: list[str] = []
self.parse: Optional[dict[str, Any]] = None
def save_endpoint(self) -> str:
"""Endpoint for saving changes."""
href: str = flask.url_for("save", enwiki=self.enwiki.replace(" ", "_"))
return href
def load(self) -> None:
"""Load parsed article HTML."""
self.parse = call_parse_api(self.enwiki)
self.root = lxml.html.fromstring(self.parse.pop("text"))
def iter_links(self) -> Iterator[tuple[lxml.html.Element, str]]:
"""Disambiguation links that need fixing."""
seen = set()
for a in self.root.findall(".//a[@href]"):
title = a.get("title")
if title is None or title not in self.links:
continue
a.set("class", "disambig")
if title in seen:
continue
seen.add(title)
yield a, title
def process_links(self) -> None:
"""Process links in parsed wikitext."""
for dab_num, (a, title) in enumerate(self.iter_links()):
a.set("id", f"dab-{dab_num}")
dab: DabItem = {
"num": dab_num,
"title": title,
"html": get_dab_html(dab_num, title),
}
self.dab_list.append(dab)
self.dab_order.append(title)
self.dab_lookup[dab_num] = title
def get_html(self) -> str:
"""Return the processed article HTML."""
html: str = lxml.html.tostring(self.root, encoding=str)
return html
@app.route("/enwiki/<path:enwiki>") @app.route("/enwiki/<path:enwiki>")
def article_page(enwiki: str) -> Response: def article_page(enwiki: str) -> Response:
"""Article Page.""" """Article Page."""
enwiki_orig = enwiki redirect = redirect_if_needed(enwiki)
enwiki = enwiki.replace("_", " ") if redirect:
enwiki_underscore = enwiki.replace(" ", "_") return redirect
if " " in enwiki_orig:
return flask.redirect(
flask.url_for(flask.request.endpoint, enwiki=enwiki_underscore)
)
article = Article(enwiki) article = wikipedia.Article(enwiki)
article.load() article.load()
article.process_links() article.process_links()