From 5ff4749512a257e5a6ba5c787a3fd132d6fe9213 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Wed, 17 Aug 2022 13:34:17 +0100 Subject: [PATCH 01/10] Formatting for article list --- templates/index.html | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/templates/index.html b/templates/index.html index 21e1b5f..1758d8e 100644 --- a/templates/index.html +++ b/templates/index.html @@ -1,12 +1,14 @@ {% extends "base.html" %} {% block content %} - + + {% endblock %} From 8432632aae9a6b9f52e5d10fced24d94f4c4725c Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Wed, 17 Aug 2022 13:34:55 +0100 Subject: [PATCH 02/10] Show more articles on index page --- web_view.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/web_view.py b/web_view.py index 61ba0d1..03f1ccb 100755 --- a/web_view.py +++ b/web_view.py @@ -23,6 +23,8 @@ wiki_hostname = "en.wikipedia.org" wiki_api_php = f"https://{wiki_hostname}/w/api.php" wiki_index_php = f"https://{wiki_hostname}/w/index.php" +awdl_url = "https://dplbot.toolforge.org/articles_with_dab_links.php" + @app.before_request def global_user(): @@ -78,8 +80,7 @@ def parse_articles_with_dab_links(root: lxml.html.Element) -> list[tuple[str, in @app.route("/") def index(): - - r = requests.get("https://dplbot.toolforge.org/articles_with_dab_links.php") + r = requests.get(awdl_url, params={"limit": 100}) root = lxml.html.fromstring(r.content) articles = parse_articles_with_dab_links(root) From 78de5cc139b1f927c7b59c7e3bdcd502d8843927 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Wed, 17 Aug 2022 13:35:26 +0100 Subject: [PATCH 03/10] Split code into another file --- dab_mechanic/wikipedia.py | 220 ++++++++++++++++++++++++++++++++++++++ web_view.py | 211 +----------------------------------- 2 files changed, 222 insertions(+), 209 deletions(-) create mode 100644 dab_mechanic/wikipedia.py diff --git a/dab_mechanic/wikipedia.py b/dab_mechanic/wikipedia.py new file mode 100644 index 0000000..3b59278 --- /dev/null +++ b/dab_mechanic/wikipedia.py @@ -0,0 +1,220 @@ +from collections import defaultdict +from typing import Any, Iterator, Optional, TypedDict + +import flask +import lxml.html +import requests + +disambig_templates = [ + "Template:Disambiguation", + "Template:Airport disambiguation", + "Template:Biology disambiguation", + "Template:Call sign disambiguation", + "Template:Caselaw disambiguation", + "Template:Chinese title disambiguation", + "Template:Disambiguation cleanup", + "Template:Genus disambiguation", + "Template:Hospital disambiguation", + "Template:Human name disambiguation", + "Template:Human name disambiguation cleanup", + "Template:Letter-number combination disambiguation", + "Template:Mathematical disambiguation", + "Template:Military unit disambiguation", + "Template:Music disambiguation", + "Template:Number disambiguation", + "Template:Opus number disambiguation", + "Template:Phonetics disambiguation", + "Template:Place name disambiguation", + "Template:Portal disambiguation", + "Template:Road disambiguation", + "Template:School disambiguation", + "Template:Species Latin name abbreviation disambiguation", + "Template:Species Latin name disambiguation", + "Template:Station disambiguation", + "Template:Synagogue disambiguation", + "Template:Taxonomic authority disambiguation", + "Template:Taxonomy disambiguation", + "Template:Template disambiguation", + "Template:WoO number disambiguation", +] + + +def link_params(enwiki: str) -> dict[str, str | int]: + """Parameters for finding article links from the API.""" + 
params: dict[str, str | int] = { + "action": "query", + "format": "json", + "formatversion": 2, + "titles": enwiki, + "generator": "links", + "gpllimit": "max", + "gplnamespace": 0, + "tllimit": "max", + "redirects": 1, + "tlnamespace": 10, + "tltemplates": "|".join(disambig_templates), + "prop": "templates", + } + return params + + +def needs_disambig(link: dict[str, Any]) -> bool: + """Is this a disambiguation link.""" + return bool( + not link["title"].endswith(" (disambiguation)") and link.get("templates") + ) + + +def get_article_links(enwiki: str) -> list[str]: + """Get links that appear in this article.""" + url = "https://en.wikipedia.org/w/api.php" + + params: dict[str, str | int] = link_params(enwiki) + links: set[str] = set() + + redirects = defaultdict(set) + + while True: + data = requests.get(url, params=params).json() + pages = data["query"].pop("pages") + for r in data["query"].pop("redirects"): + redirects[r["to"]].add(r["from"]) + + links.update(page["title"] for page in pages if needs_disambig(page)) + + if "continue" not in data: + break + + params["gplcontinue"] = data["continue"]["gplcontinue"] + + for link in set(links): + if link in redirects: + links.update(redirects[link]) + + return list(links) + + # return {link["title"] for link in r.json()["query"]["pages"][0]["links"]} + + +def call_parse_api(enwiki: str) -> dict[str, Any]: + """Call mediawiki parse API for given article.""" + url = "https://en.wikipedia.org/w/api.php" + + params: dict[str, str | int] = { + "action": "parse", + "format": "json", + "formatversion": 2, + "disableeditsection": 1, + "page": enwiki, + "prop": "text|links|headhtml", + "disabletoc": 1, + } + + r = requests.get(url, params=params) + parse: dict[str, Any] = r.json()["parse"] + return parse + + +def get_article_html(enwiki: str) -> str: + """Parse article wikitext and return HTML.""" + text: str = call_parse_api(enwiki)["text"] + return text + + +class DabItem(TypedDict): + """Represent a disabiguation page.""" + + num: int + title: str + html: str + + +def delete_toc(root: lxml.html.HtmlElement) -> None: + """Delete table of contents from article HTML.""" + for toc in root.findall(".//div[@class='toc']"): + toc.getparent().remove(toc) + + +def get_dab_html(dab_num: int, title: str) -> str: + """Parse dab page and rewrite links.""" + dab_html = get_article_html(title) + root = lxml.html.fromstring(dab_html) + delete_toc(root) + + element_id_map = {e.get("id"): e for e in root.findall(".//*[@id]")} + + for a in root.findall(".//a[@href]"): + href: str | None = a.get("href") + if not href: + continue + if not href.startswith("#"): + a.set("href", "#") + a.set("onclick", f"return select_dab(this, {dab_num})") + continue + + destination_element = element_id_map[href[1:]] + assert destination_element is not None + destination_element.set("id", f"{dab_num}{href[1:]}") + a.set("href", f"#{dab_num}{href[1:]}") + + html: str = lxml.html.tostring(root, encoding=str) + return html + + +class Article: + """Current article we're working on.""" + + def __init__(self, enwiki: str) -> None: + """Make a new Article object.""" + self.enwiki = enwiki + + self.links = get_article_links(enwiki) + + self.dab_list: list[DabItem] = [] + self.dab_lookup: dict[int, str] = {} + self.dab_order: list[str] = [] + self.parse: Optional[dict[str, Any]] = None + + def save_endpoint(self) -> str: + """Endpoint for saving changes.""" + href: str = flask.url_for("save", enwiki=self.enwiki.replace(" ", "_")) + return href + + def load(self) -> None: + """Load parsed 
article HTML.""" + self.parse = call_parse_api(self.enwiki) + self.root = lxml.html.fromstring(self.parse.pop("text")) + + def iter_links(self) -> Iterator[tuple[lxml.html.Element, str]]: + """Disambiguation links that need fixing.""" + seen = set() + for a in self.root.findall(".//a[@href]"): + title = a.get("title") + if title is None or title not in self.links: + continue + a.set("class", "disambig") + + if title in seen: + continue + seen.add(title) + + yield a, title + + def process_links(self) -> None: + """Process links in parsed wikitext.""" + for dab_num, (a, title) in enumerate(self.iter_links()): + a.set("id", f"dab-{dab_num}") + + dab: DabItem = { + "num": dab_num, + "title": title, + "html": get_dab_html(dab_num, title), + } + self.dab_list.append(dab) + self.dab_order.append(title) + self.dab_lookup[dab_num] = title + + def get_html(self) -> str: + """Return the processed article HTML.""" + html: str = lxml.html.tostring(self.root, encoding=str) + return html diff --git a/web_view.py b/web_view.py index 03f1ccb..e6c33d4 100755 --- a/web_view.py +++ b/web_view.py @@ -3,7 +3,6 @@ import inspect import json import re -from typing import Any, Iterator, Optional, TypedDict import flask import lxml.html @@ -13,7 +12,7 @@ from requests_oauthlib import OAuth1Session from werkzeug.debug.tbtools import get_current_traceback from werkzeug.wrappers import Response -from dab_mechanic import wikidata_oauth +from dab_mechanic import wikidata_oauth, wikipedia app = flask.Flask(__name__) app.config.from_object("config.default") @@ -89,145 +88,6 @@ def index(): return flask.render_template("index.html", articles=articles) -def call_parse_api(enwiki: str) -> dict[str, Any]: - """Call mediawiki parse API for given article.""" - url = "https://en.wikipedia.org/w/api.php" - - params: dict[str, str | int] = { - "action": "parse", - "format": "json", - "formatversion": 2, - "disableeditsection": 1, - "page": enwiki, - "prop": "text|links|headhtml", - "disabletoc": 1, - } - - r = requests.get(url, params=params) - parse: dict[str, Any] = r.json()["parse"] - return parse - - -def get_article_html(enwiki: str) -> str: - """Parse article wikitext and return HTML.""" - text: str = call_parse_api(enwiki)["text"] - return text - - -disambig_templates = [ - "Template:Disambiguation", - "Template:Airport disambiguation", - "Template:Biology disambiguation", - "Template:Call sign disambiguation", - "Template:Caselaw disambiguation", - "Template:Chinese title disambiguation", - "Template:Disambiguation cleanup", - "Template:Genus disambiguation", - "Template:Hospital disambiguation", - "Template:Human name disambiguation", - "Template:Human name disambiguation cleanup", - "Template:Letter-number combination disambiguation", - "Template:Mathematical disambiguation", - "Template:Military unit disambiguation", - "Template:Music disambiguation", - "Template:Number disambiguation", - "Template:Opus number disambiguation", - "Template:Phonetics disambiguation", - "Template:Place name disambiguation", - "Template:Portal disambiguation", - "Template:Road disambiguation", - "Template:School disambiguation", - "Template:Species Latin name abbreviation disambiguation", - "Template:Species Latin name disambiguation", - "Template:Station disambiguation", - "Template:Synagogue disambiguation", - "Template:Taxonomic authority disambiguation", - "Template:Taxonomy disambiguation", - "Template:Template disambiguation", - "Template:WoO number disambiguation", -] - - -def link_params(enwiki: str) -> dict[str, str | int]: - 
"""Parameters for finding article links from the API.""" - params: dict[str, str | int] = { - "action": "query", - "format": "json", - "formatversion": 2, - "titles": enwiki, - "generator": "links", - "gpllimit": "max", - "gplnamespace": 0, - "tllimit": "max", - "tlnamespace": 10, - "tltemplates": "|".join(disambig_templates), - "prop": "templates", - } - return params - - -def needs_disambig(link: dict[str, Any]) -> bool: - """Is this a disambiguation link.""" - return bool( - not link["title"].endswith(" (disambiguation)") and link.get("templates") - ) - - -def get_article_links(enwiki: str) -> list[str]: - """Get links that appear in this article.""" - url = "https://en.wikipedia.org/w/api.php" - - params: dict[str, str | int] = link_params(enwiki) - links: set[str] = set() - - while True: - data = requests.get(url, params=params).json() - links.update( - page["title"] for page in data["query"]["pages"] if needs_disambig(page) - ) - - if "continue" not in data: - break - - params["gplcontinue"] = data["continue"]["gplcontinue"] - - return list(links) - - # return {link["title"] for link in r.json()["query"]["pages"][0]["links"]} - - -def delete_toc(root: lxml.html.HtmlElement) -> None: - """Delete table of contents from article HTML.""" - for toc in root.findall(".//div[@class='toc']"): - toc.getparent().remove(toc) - - -def get_dab_html(dab_num: int, title: str) -> str: - """Parse dab page and rewrite links.""" - dab_html = get_article_html(title) - root = lxml.html.fromstring(dab_html) - delete_toc(root) - - element_id_map = {e.get("id"): e for e in root.findall(".//*[@id]")} - - for a in root.findall(".//a[@href]"): - href: str | None = a.get("href") - if not href: - continue - if not href.startswith("#"): - a.set("href", "#") - a.set("onclick", f"return select_dab(this, {dab_num})") - continue - - destination_element = element_id_map[href[1:]] - assert destination_element is not None - destination_element.set("id", f"{dab_num}{href[1:]}") - a.set("href", f"#{dab_num}{href[1:]}") - - html: str = lxml.html.tostring(root, encoding=str) - return html - - def make_disamb_link(edit: tuple[str, str]) -> str: """Given an edit return the appropriate link.""" return f"[[{edit[1]}|{edit[0]}]]" @@ -278,73 +138,6 @@ def save(enwiki: str) -> Response | str: ) -class DabItem(TypedDict): - """Represent a disabiguation page.""" - - num: int - title: str - html: str - - -class Article: - """Current article we're working on.""" - - def __init__(self, enwiki: str) -> None: - """Make a new Article object.""" - self.enwiki = enwiki - - self.links = get_article_links(enwiki) - - self.dab_list: list[DabItem] = [] - self.dab_lookup: dict[int, str] = {} - self.dab_order: list[str] = [] - self.parse: Optional[dict[str, Any]] = None - - def save_endpoint(self) -> str: - """Endpoint for saving changes.""" - href: str = flask.url_for("save", enwiki=self.enwiki.replace(" ", "_")) - return href - - def load(self) -> None: - """Load parsed article HTML.""" - self.parse = call_parse_api(self.enwiki) - self.root = lxml.html.fromstring(self.parse.pop("text")) - - def iter_links(self) -> Iterator[tuple[lxml.html.Element, str]]: - """Disambiguation links that need fixing.""" - seen = set() - for a in self.root.findall(".//a[@href]"): - title = a.get("title") - if title is None or title not in self.links: - continue - a.set("class", "disambig") - - if title in seen: - continue - seen.add(title) - - yield a, title - - def process_links(self) -> None: - """Process links in parsed wikitext.""" - for dab_num, (a, title) in 
enumerate(self.iter_links()): - a.set("id", f"dab-{dab_num}") - - dab: DabItem = { - "num": dab_num, - "title": title, - "html": get_dab_html(dab_num, title), - } - self.dab_list.append(dab) - self.dab_order.append(title) - self.dab_lookup[dab_num] = title - - def get_html(self) -> str: - """Return the processed article HTML.""" - html: str = lxml.html.tostring(self.root, encoding=str) - return html - - @app.route("/enwiki/") def article_page(enwiki: str) -> Response: """Article Page.""" @@ -356,7 +149,7 @@ def article_page(enwiki: str) -> Response: flask.url_for(flask.request.endpoint, enwiki=enwiki_underscore) ) - article = Article(enwiki) + article = wikipedia.Article(enwiki) article.load() article.process_links() From 82688221816f53f45f49688cad9d146d6cc45742 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Wed, 17 Aug 2022 13:46:00 +0100 Subject: [PATCH 04/10] Remove unused article list --- article_list | 50 -------------------------------------------------- 1 file changed, 50 deletions(-) delete mode 100644 article_list diff --git a/article_list b/article_list deleted file mode 100644 index 7f4bcaf..0000000 --- a/article_list +++ /dev/null @@ -1,50 +0,0 @@ -Rail transport in Indonesia -SchleFaZ -Chicago Bulls -Orwell Prize -List of fatal victims of the September 11 attacks -Arabic exonyms -Canadian Alpine Ski Championships -Method Man filmography -Popular Union -The Cantos -Unisex name -United States Alpine Ski Championships -AS Kaloum Star -Akademi Fantasia (season 1) -Athletics at the 2022 Bolivarian Games -I Love the 2000s -Kununokuni -List of Wisin & Yandel collaborations -List of comics based on films -List of programs broadcast by Asianet -Urban Hymns -1979 Sydney City FC season -2007 in Spanish television -2022 World Athletics U20 Championships – Men's 4 × 100 metres relay -A2 autostrada (Poland) -Black to the Future (TV series) -Chandel (Rajput clan) -County of Isenburg -Dinka people -Dwayne McDuffie Award for Diversity in Comics -FTSE Italia Mid Cap -Globoplay -Index of Armenia-related articles -List of Equinox episodes -List of Indian monarchs -List of Italian exonyms in Dalmatia -List of Ultimate Marvel characters -List of cities with historical German exonyms -List of jötnar in Norse mythology -List of language families -List of people with surname Davis -List of political parties in Venezuela -List of programmes broadcast by HTV -Paul (given name) -Principality of Lippe -Propaganda in Russia -Qazi Ghulam Mustafa -Redfern Now -Roy Orbison/The Beatles Tour -Royal Birmingham Conservatoire From c2b3d22e451daf22e4ba774ea0358826a9ee779b Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Wed, 17 Aug 2022 13:54:16 +0100 Subject: [PATCH 05/10] Add CSS for error page --- static/css/exception.css | 78 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 static/css/exception.css diff --git a/static/css/exception.css b/static/css/exception.css new file mode 100644 index 0000000..1f141c5 --- /dev/null +++ b/static/css/exception.css @@ -0,0 +1,78 @@ +div.debugger { text-align: left; padding: 12px; margin: auto; + background-color: white; } +div.detail { cursor: pointer; } +div.detail p { margin: 0 0 8px 13px; font-size: 14px; white-space: pre-wrap; + font-family: monospace; } +div.explanation { margin: 20px 13px; font-size: 15px; color: #555; } +div.footer { font-size: 13px; text-align: right; margin: 30px 0; + color: #86989B; } + +h2 { font-size: 16px; margin: 1.3em 0 0.0 0; padding: 9px; + background-color: #11557C; color: white; } +h2 em, h3 em 
{ font-style: normal; color: #A5D6D9; font-weight: normal; } + +div.traceback, div.plain { border: 1px solid #ddd; margin: 0 0 1em 0; padding: 10px; } +div.plain p { margin: 0; } +div.plain textarea, +div.plain pre { margin: 10px 0 0 0; padding: 4px; + background-color: #E8EFF0; border: 1px solid #D3E7E9; } +div.plain textarea { width: 99%; height: 300px; } +div.traceback h3 { font-size: 1em; margin: 0 0 0.8em 0; } +div.traceback ul { list-style: none; margin: 0; padding: 0 0 0 1em; } +div.traceback h4 { font-size: 13px; font-weight: normal; margin: 0.7em 0 0.1em 0; } +div.traceback pre { margin: 0; padding: 5px 0 3px 15px; + background-color: #E8EFF0; border: 1px solid #D3E7E9; } +div.traceback .library .current { background: white; color: #555; } +div.traceback .expanded .current { background: #E8EFF0; color: black; } +div.traceback pre:hover { background-color: #DDECEE; color: black; cursor: pointer; } +div.traceback div.source.expanded pre + pre { border-top: none; } + +div.traceback span.ws { display: none; } +div.traceback pre.before, div.traceback pre.after { display: none; background: white; } +div.traceback div.source.expanded pre.before, +div.traceback div.source.expanded pre.after { + display: block; +} + +div.traceback div.source.expanded span.ws { + display: inline; +} + +div.traceback blockquote { margin: 1em 0 0 0; padding: 0; white-space: pre-line; } +div.traceback img { float: right; padding: 2px; margin: -3px 2px 0 0; display: none; } +div.traceback img:hover { background-color: #ddd; cursor: pointer; + border-color: #BFDDE0; } +div.traceback pre:hover img { display: block; } +div.traceback cite.filename { font-style: normal; color: #3B666B; } + +pre.console { border: 1px solid #ccc; background: white!important; + color: black; padding: 5px!important; + margin: 3px 0 0 0!important; cursor: default!important; + max-height: 400px; overflow: auto; } +pre.console form { color: #555; } +pre.console input { background-color: transparent; color: #555; + width: 90%; font-family: 'Consolas', 'Deja Vu Sans Mono', + 'Bitstream Vera Sans Mono', monospace; font-size: 14px; + border: none!important; } + +span.string { color: #30799B; } +span.number { color: #9C1A1C; } +span.help { color: #3A7734; } +span.object { color: #485F6E; } +span.extended { opacity: 0.5; } +span.extended:hover { opacity: 1; } +a.toggle { text-decoration: none; background-repeat: no-repeat; + background-position: center center; + background-image: url(?__debugger__=yes&cmd=resource&f=more.png); } +a.toggle:hover { background-color: #444; } +a.open { background-image: url(?__debugger__=yes&cmd=resource&f=less.png); } + +div.traceback pre, div.console pre { + white-space: pre-wrap; /* css-3 should we be so lucky... */ + white-space: -moz-pre-wrap; /* Mozilla, since 1999 */ + white-space: -pre-wrap; /* Opera 4-6 ?? */ + white-space: -o-pre-wrap; /* Opera 7 ?? 
*/ + word-wrap: break-word; /* Internet Explorer 5.5+ */ + _white-space: pre; /* IE only hack to re-specify in + addition to word-wrap */ +} From 4e1ad4efbc523ca73ab8c433811aabd546218bc7 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Wed, 17 Aug 2022 13:58:17 +0100 Subject: [PATCH 06/10] add gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bee8a64 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +__pycache__ From 5f8900a47abf4bf28e122d3b943bee16b62cc3c2 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Wed, 17 Aug 2022 14:38:30 +0100 Subject: [PATCH 07/10] Add User-Agent to mediawiki API calls --- dab_mechanic/mediawiki_api.py | 48 +++++++++++++++++++++++++++++++++++ dab_mechanic/wikipedia.py | 29 ++++----------------- web_view.py | 19 ++------------ 3 files changed, 55 insertions(+), 41 deletions(-) create mode 100644 dab_mechanic/mediawiki_api.py diff --git a/dab_mechanic/mediawiki_api.py b/dab_mechanic/mediawiki_api.py new file mode 100644 index 0000000..0196207 --- /dev/null +++ b/dab_mechanic/mediawiki_api.py @@ -0,0 +1,48 @@ +"""Interface with the mediawiki API.""" + +from typing import Any + +import requests + +wiki_hostname = "en.wikipedia.org" +wiki_api_php = f"https://{wiki_hostname}/w/api.php" +user_agent = "dab-mechanic/0.1" + + +def parse_page(enwiki: str) -> dict[str, Any]: + """Call mediawiki parse API for given article.""" + params: dict[str, str | int] = { + "action": "parse", + "format": "json", + "formatversion": 2, + "disableeditsection": 1, + "page": enwiki, + "prop": "text|links|headhtml", + "disabletoc": 1, + } + + parse: dict[str, Any] = get(params)["parse"] + return parse + + +def get(params: dict[str, str | int]) -> dict[str, Any]: + """Make GET request to mediawiki API.""" + data: dict[str, Any] = requests.get( + wiki_api_php, headers={"User-Agent": user_agent}, params=params + ).json() + return data + + +def get_content(title: str) -> str: + """Get article text.""" + params: dict[str, str | int] = { + "action": "query", + "format": "json", + "formatversion": 2, + "prop": "revisions|info", + "rvprop": "content|timestamp", + "titles": title, + } + data = get(params) + rev: str = data["query"]["pages"][0]["revisions"][0]["content"] + return rev diff --git a/dab_mechanic/wikipedia.py b/dab_mechanic/wikipedia.py index 3b59278..844508f 100644 --- a/dab_mechanic/wikipedia.py +++ b/dab_mechanic/wikipedia.py @@ -3,7 +3,8 @@ from typing import Any, Iterator, Optional, TypedDict import flask import lxml.html -import requests + +from . 
import mediawiki_api disambig_templates = [ "Template:Disambiguation", @@ -67,7 +68,6 @@ def needs_disambig(link: dict[str, Any]) -> bool: def get_article_links(enwiki: str) -> list[str]: """Get links that appear in this article.""" - url = "https://en.wikipedia.org/w/api.php" params: dict[str, str | int] = link_params(enwiki) links: set[str] = set() @@ -75,7 +75,7 @@ def get_article_links(enwiki: str) -> list[str]: redirects = defaultdict(set) while True: - data = requests.get(url, params=params).json() + data = mediawiki_api.get(params) pages = data["query"].pop("pages") for r in data["query"].pop("redirects"): redirects[r["to"]].add(r["from"]) @@ -96,28 +96,9 @@ def get_article_links(enwiki: str) -> list[str]: # return {link["title"] for link in r.json()["query"]["pages"][0]["links"]} -def call_parse_api(enwiki: str) -> dict[str, Any]: - """Call mediawiki parse API for given article.""" - url = "https://en.wikipedia.org/w/api.php" - - params: dict[str, str | int] = { - "action": "parse", - "format": "json", - "formatversion": 2, - "disableeditsection": 1, - "page": enwiki, - "prop": "text|links|headhtml", - "disabletoc": 1, - } - - r = requests.get(url, params=params) - parse: dict[str, Any] = r.json()["parse"] - return parse - - def get_article_html(enwiki: str) -> str: """Parse article wikitext and return HTML.""" - text: str = call_parse_api(enwiki)["text"] + text: str = mediawiki_api.parse_page(enwiki)["text"] return text @@ -182,7 +163,7 @@ class Article: def load(self) -> None: """Load parsed article HTML.""" - self.parse = call_parse_api(self.enwiki) + self.parse = mediawiki_api.parse_page(self.enwiki) self.root = lxml.html.fromstring(self.parse.pop("text")) def iter_links(self) -> Iterator[tuple[lxml.html.Element, str]]: diff --git a/web_view.py b/web_view.py index e6c33d4..cac9233 100755 --- a/web_view.py +++ b/web_view.py @@ -12,7 +12,7 @@ from requests_oauthlib import OAuth1Session from werkzeug.debug.tbtools import get_current_traceback from werkzeug.wrappers import Response -from dab_mechanic import wikidata_oauth, wikipedia +from dab_mechanic import mediawiki_api, wikidata_oauth, wikipedia app = flask.Flask(__name__) app.config.from_object("config.default") @@ -47,21 +47,6 @@ def exception_handler(e): ) -def get_content(title: str) -> str: - """Get article text.""" - params: dict[str, str | int] = { - "action": "query", - "format": "json", - "formatversion": 2, - "prop": "revisions|info", - "rvprop": "content|timestamp", - "titles": title, - } - data = requests.get(wiki_api_php, params=params).json() - rev: str = data["query"]["pages"][0]["revisions"][0]["content"] - return rev - - def parse_articles_with_dab_links(root: lxml.html.Element) -> list[tuple[str, int]]: """Parse Articles With Multiple Dablinks.""" articles = [] @@ -127,7 +112,7 @@ def save(enwiki: str) -> Response | str: edit_summary = f"Disambiguate {titles} using [[User:Edward/Dab mechanic]]" - article_text = apply_edits(get_content(enwiki), edits) + article_text = apply_edits(mediawiki_api.get_content(enwiki), edits) return flask.render_template( "save.html", From b1f402e1f9cb273d6603fa229441c3c84779ba3a Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Wed, 17 Aug 2022 14:48:20 +0100 Subject: [PATCH 08/10] refactor --- dab_mechanic/wikipedia.py | 2 +- web_view.py | 22 +++++++++++++++------- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/dab_mechanic/wikipedia.py b/dab_mechanic/wikipedia.py index 844508f..cf9510f 100644 --- a/dab_mechanic/wikipedia.py +++ b/dab_mechanic/wikipedia.py @@ -147,7 
+147,7 @@ class Article: def __init__(self, enwiki: str) -> None: """Make a new Article object.""" - self.enwiki = enwiki + self.enwiki = enwiki.replace("_", " ") self.links = get_article_links(enwiki) diff --git a/web_view.py b/web_view.py index cac9233..c50fa7d 100755 --- a/web_view.py +++ b/web_view.py @@ -3,6 +3,7 @@ import inspect import json import re +from typing import Optional import flask import lxml.html @@ -123,16 +124,23 @@ def save(enwiki: str) -> Response | str: ) +def redirect_if_needed(enwiki: str) -> Optional[Response]: + """Check if there are spaces in the article name and redirect.""" + return ( + flask.redirect( + flask.url_for(flask.request.endpoint, enwiki=enwiki.replace(" ", "_")) + ) + if " " in enwiki + else None + ) + + @app.route("/enwiki/") def article_page(enwiki: str) -> Response: """Article Page.""" - enwiki_orig = enwiki - enwiki = enwiki.replace("_", " ") - enwiki_underscore = enwiki.replace(" ", "_") - if " " in enwiki_orig: - return flask.redirect( - flask.url_for(flask.request.endpoint, enwiki=enwiki_underscore) - ) + redirect = redirect_if_needed(enwiki) + if redirect: + return redirect article = wikipedia.Article(enwiki) article.load() From e85cefbc2f302afdfd68567eb404a6e56ad4759d Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Wed, 17 Aug 2022 20:04:43 +0100 Subject: [PATCH 09/10] Make mediawiki API calls via OAuth The API had a timeout problem. Maybe this fixes it. --- dab_mechanic/mediawiki_api.py | 15 ++++++--------- dab_mechanic/wikidata_oauth.py | 7 +++---- dab_mechanic/wikipedia.py | 7 ++++++- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/dab_mechanic/mediawiki_api.py b/dab_mechanic/mediawiki_api.py index 0196207..26d7a20 100644 --- a/dab_mechanic/mediawiki_api.py +++ b/dab_mechanic/mediawiki_api.py @@ -1,8 +1,7 @@ """Interface with the mediawiki API.""" from typing import Any - -import requests +from . 
import wikidata_oauth

 wiki_hostname = "en.wikipedia.org"
 wiki_api_php = f"https://{wiki_hostname}/w/api.php"
 user_agent = "dab-mechanic/0.1"
@@ -21,16 +20,14 @@ def parse_page(enwiki: str) -> dict[str, Any]:
         "disabletoc": 1,
     }

-    parse: dict[str, Any] = get(params)["parse"]
+    parse: dict[str, Any] = call(params)["parse"]
     return parse


-def get(params: dict[str, str | int]) -> dict[str, Any]:
+def call(params: dict[str, str | int]) -> dict[str, Any]:
     """Make GET request to mediawiki API."""
-    data: dict[str, Any] = requests.get(
-        wiki_api_php, headers={"User-Agent": user_agent}, params=params
-    ).json()
-    return data
+    data: dict[str, Any] = wikidata_oauth.api_post_request(params)
+    return data.json()


 def get_content(title: str) -> str:
     """Get article text."""
@@ -43,6 +40,6 @@ def get_content(title: str) -> str:
         "rvprop": "content|timestamp",
         "titles": title,
     }
-    data = get(params)
+    data = call(params)
     rev: str = data["query"]["pages"][0]["revisions"][0]["content"]
     return rev
diff --git a/dab_mechanic/wikidata_oauth.py b/dab_mechanic/wikidata_oauth.py
index dca0707..5af0976 100644
--- a/dab_mechanic/wikidata_oauth.py
+++ b/dab_mechanic/wikidata_oauth.py
@@ -19,7 +19,6 @@ def get_edit_proxy() -> dict[str, str]:
 def api_post_request(params: dict[str, str | int]):
     """HTTP Post using Oauth."""
     app = current_app
-    url = "https://www.wikidata.org/w/api.php"
     client_key = app.config["CLIENT_KEY"]
     client_secret = app.config["CLIENT_SECRET"]
     oauth = OAuth1Session(
@@ -29,12 +28,12 @@ def api_post_request(params: dict[str, str | int]):
         resource_owner_secret=session["owner_secret"],
     )
     proxies = get_edit_proxy()
-    return oauth.post(url, data=params, timeout=4, proxies=proxies)
+    return oauth.post(api_url, data=params, timeout=10, proxies=proxies)


 def raw_request(params):
     app = current_app
-    url = "https://www.wikidata.org/w/api.php?" + urlencode(params)
+    url = api_url + "?" + urlencode(params)
     client_key = app.config["CLIENT_KEY"]
     client_secret = app.config["CLIENT_SECRET"]
     oauth = OAuth1Session(
@@ -44,7 +43,7 @@ def raw_request(params):
         resource_owner_secret=session["owner_secret"],
     )
     proxies = get_edit_proxy()
-    return oauth.get(url, timeout=4, proxies=proxies)
+    return oauth.get(url, timeout=10, proxies=proxies)


 def api_request(params):
diff --git a/dab_mechanic/wikipedia.py b/dab_mechanic/wikipedia.py
index cf9510f..57c03c4 100644
--- a/dab_mechanic/wikipedia.py
+++ b/dab_mechanic/wikipedia.py
@@ -5,6 +5,8 @@ import flask
 import lxml.html

 from . import mediawiki_api
+from pprint import pprint
+from time import sleep

 disambig_templates = [
     "Template:Disambiguation",
@@ -75,7 +77,9 @@ def get_article_links(enwiki: str) -> list[str]:
     redirects = defaultdict(set)

     while True:
-        data = mediawiki_api.get(params)
+        data = mediawiki_api.call(params)
+        if "query" not in data:
+            pprint(data)
         pages = data["query"].pop("pages")
         for r in data["query"].pop("redirects"):
             redirects[r["to"]].add(r["from"])
@@ -86,6 +90,7 @@ def get_article_links(enwiki: str) -> list[str]:
             break

         params["gplcontinue"] = data["continue"]["gplcontinue"]
+        sleep(0.1)

     for link in set(links):
         if link in redirects:
             links.update(redirects[link])

From 4d175c8733b043a51f35d09cf11a4d92de34b498 Mon Sep 17 00:00:00 2001
From: Edward Betts
Date: Wed, 17 Aug 2022 20:06:36 +0100
Subject: [PATCH 10/10] Turn off app.debug so error pages work.
--- web_view.py | 1 - 1 file changed, 1 deletion(-) diff --git a/web_view.py b/web_view.py index c50fa7d..2730f23 100755 --- a/web_view.py +++ b/web_view.py @@ -17,7 +17,6 @@ from dab_mechanic import mediawiki_api, wikidata_oauth, wikipedia app = flask.Flask(__name__) app.config.from_object("config.default") -app.debug = True wiki_hostname = "en.wikipedia.org" wiki_api_php = f"https://{wiki_hostname}/w/api.php"
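
A quick end-to-end check for the series: the sketch below exercises the new dab_mechanic.mediawiki_api module directly. It is illustrative only, and assumes the module as it stands after patch 07, when requests are still plain HTTP with a User-Agent header; after patch 09 the helper is renamed from get() to call() and routed through wikidata_oauth, so it then needs a Flask request context with OAuth tokens in the session. The article title is an arbitrary pick from the list removed in patch 04.

# Illustrative smoke test; assumes the module as of patch 07 (plain HTTP,
# before the OAuth switch in patch 09).
from dab_mechanic import mediawiki_api

# parse_page() wraps action=parse and returns the "parse" object,
# including the rendered "text" plus "links" and "headhtml".
parse = mediawiki_api.parse_page("Unisex name")
print(parse["text"][:80])

# get_content() wraps action=query&prop=revisions|info and returns
# the wikitext of the article's latest revision.
wikitext = mediawiki_api.get_content("Unisex name")
print(wikitext[:80])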