from collections import defaultdict
from pprint import pprint
from time import sleep
from typing import Any, Iterator, Optional, TypedDict

import flask
import lxml.html

from . import mediawiki_api


disambig_templates = [
    "Template:Disambiguation",
    "Template:Airport disambiguation",
    "Template:Biology disambiguation",
    "Template:Call sign disambiguation",
    "Template:Caselaw disambiguation",
    "Template:Chinese title disambiguation",
    "Template:Disambiguation cleanup",
    "Template:Genus disambiguation",
    "Template:Hospital disambiguation",
    "Template:Human name disambiguation",
    "Template:Human name disambiguation cleanup",
    "Template:Letter-number combination disambiguation",
    "Template:Mathematical disambiguation",
    "Template:Military unit disambiguation",
    "Template:Music disambiguation",
    "Template:Number disambiguation",
    "Template:Opus number disambiguation",
    "Template:Phonetics disambiguation",
    "Template:Place name disambiguation",
    "Template:Portal disambiguation",
    "Template:Road disambiguation",
    "Template:School disambiguation",
    "Template:Species Latin name abbreviation disambiguation",
    "Template:Species Latin name disambiguation",
    "Template:Station disambiguation",
    "Template:Synagogue disambiguation",
    "Template:Taxonomic authority disambiguation",
    "Template:Taxonomy disambiguation",
    "Template:Template disambiguation",
    "Template:WoO number disambiguation",
]


def link_params(enwiki: str) -> dict[str, str | int]:
    """Parameters for finding article links from the API."""
    params: dict[str, str | int] = {
        "action": "query",
        "format": "json",
        "formatversion": 2,
        "titles": enwiki,
        "redirects": 1,
        # generator: walk the pages this article links to (main namespace)
        "generator": "links",
        "gpllimit": "max",
        "gplnamespace": 0,
        # for each linked page, list any disambiguation templates it transcludes
        "prop": "templates",
        "tlnamespace": 10,
        "tllimit": "max",
        "tltemplates": "|".join(disambig_templates),
    }
    return params
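
# A rough sketch of how these parameters map onto the live MediaWiki API,
# assuming the standard English Wikipedia endpoint; real requests go through
# mediawiki_api.call, so this is for illustration only:
#
#     import requests
#     resp = requests.get(
#         "https://en.wikipedia.org/w/api.php",
#         params=link_params("New York"),
#         timeout=10,
#     )
#     data = resp.json()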


def needs_disambig(link: dict[str, Any]) -> bool:
    """Is this a link to a disambiguation page?"""
    return bool(
        not link["title"].endswith(" (disambiguation)") and link.get("templates")
    )
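
# For example, with hypothetical API payloads: a linked page counts as needing
# disambiguation when it transcludes one of the templates above and is not an
# explicit "... (disambiguation)" title:
#
#     needs_disambig({"title": "Mercury", "templates": [{"title": "Template:Disambiguation"}]})
#     # -> True
#     needs_disambig({"title": "Mercury (disambiguation)", "templates": [{"title": "Template:Disambiguation"}]})
#     # -> False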


def get_article_links(enwiki: str) -> dict[str, str]:
    """Get disambiguation links that appear in this article."""
    params: dict[str, str | int] = link_params(enwiki)
    links: set[str] = set()
    redirects: defaultdict[str, set[str]] = defaultdict(set)

    while True:
        data = mediawiki_api.call(params)
        if "query" not in data:
            pprint(data)
            raise RuntimeError("unexpected response from MediaWiki API")
        pages = data["query"].pop("pages")
        # "redirects" is only present when the article links via a redirect
        for r in data["query"].pop("redirects", []):
            redirects[r["to"]].add(r["from"])

        links.update(page["title"] for page in pages if needs_disambig(page))

        if "continue" not in data:
            break

        params["gplcontinue"] = data["continue"]["gplcontinue"]
        sleep(0.1)

    # map each dab page title, and every redirect to it, to the canonical title
    ret_links: dict[str, str] = {}
    for link in links:
        ret_links[link] = link
        for r in redirects.get(link, []):
            ret_links[r] = link

    return ret_links
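
# The result maps every title the article might use for a dab page to the
# canonical dab title. For example (made-up titles): an article that links to
# the dab page "Foo" both directly and via the redirect "Foo2" yields
# {"Foo": "Foo", "Foo2": "Foo"}.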


def get_article_html(enwiki: str) -> str:
    """Parse article wikitext and return HTML."""
    text: str = mediawiki_api.parse_page(enwiki)["text"]
    return text


class DabItem(TypedDict):
    """Represent a disambiguation page."""

    num: int  # sequence number of the ambiguous link in the article
    title: str  # canonical title of the dab page
    link_to: str  # title as linked from the article (may be a redirect)
    html: str  # rendered dab page HTML with rewritten links


def delete_toc(root: lxml.html.HtmlElement) -> None:
    """Delete table of contents from article HTML."""
    for toc in root.findall(".//div[@class='toc']"):
        toc.getparent().remove(toc)


def get_dab_html(dab_num: int, html: str) -> str:
    """Parse dab page and rewrite its links."""
    root = lxml.html.fromstring(html)
    delete_toc(root)

    element_id_map = {e.get("id"): e for e in root.findall(".//*[@id]")}

    for a in root.findall(".//a[@href]"):
        href: str | None = a.get("href")
        if not href:
            continue
        if not href.startswith("#"):
            # outgoing links become buttons that select this dab entry
            a.set("href", "#")
            a.set("onclick", f"return select_dab(this, {dab_num})")
            continue

        # fragment links: prefix the target id with the dab number so ids stay
        # unique when several dab pages appear on the same page
        destination_element = element_id_map.get(href[1:])
        if destination_element is None:
            continue
        destination_element.set("id", f"{dab_num}{href[1:]}")
        a.set("href", f"#{dab_num}{href[1:]}")

    new_html: str = lxml.html.tostring(root, encoding=str)
    return new_html
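
# Sketch of the rewriting, using a made-up fragment: with dab_num=3,
#     <a href="#Places">       ->  <a href="#3Places"> (and id="Places" -> id="3Places")
#     <a href="/wiki/Foo">     ->  <a href="#" onclick="return select_dab(this, 3)">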


class Article:
    """Current article we're working on."""

    def __init__(self, enwiki: str) -> None:
        """Make a new Article object."""
        self.enwiki = enwiki.replace("_", " ")

        self.links = get_article_links(self.enwiki)

        self.dab_list: list[DabItem] = []
        self.dab_lookup: dict[int, str] = {}
        self.dab_order: list[str] = []
        self.parse: Optional[dict[str, Any]] = None
        self.dab_html: dict[str, str] = {}

    def preview_endpoint(self) -> str:
        """URL of the endpoint for previewing this article."""
        href: str = flask.url_for("preview", enwiki=self.enwiki.replace(" ", "_"))
        return href
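
    # url_for above assumes the Flask app registers a matching view named
    # "preview" that takes an enwiki argument, e.g. (hypothetical route):
    #
    #     @app.route("/preview/<path:enwiki>")
    #     def preview(enwiki: str) -> str:
    #         ...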

    def load(self) -> None:
        """Load parsed article HTML."""
        self.parse = mediawiki_api.parse_page(self.enwiki)
        self.root = lxml.html.fromstring(self.parse.pop("text"))

    def iter_links(self) -> Iterator[tuple[lxml.html.HtmlElement, str, str]]:
        """Yield (element, linked title, dab page title) for links needing fixes."""
        for a in self.root.findall(".//a[@href]"):
            title = a.get("title")
            if title is not None and title in self.links:
                yield a, title, self.links[title]

            # make relative wiki links absolute and open them in a new tab
            href = a.get("href")
            if not href.startswith("/wiki/"):
                continue
            a.set("href", "https://en.wikipedia.org" + href)
            a.set("target", "_blank")

    def dab_link_to(self) -> list[str]:
        """Article link text for each disambiguation link found."""
        return [dab["link_to"] for dab in self.dab_list]

    def process_links(self) -> None:
        """Process links in parsed wikitext."""
        for dab_num, (a, link_to, title) in enumerate(self.iter_links()):
            # mark the ambiguous link in the article HTML
            a.set("class", "disambig")
            a.set("id", f"dab-{dab_num}")

            # fetch and cache the rendered dab page
            if title not in self.dab_html:
                self.dab_html[title] = get_article_html(title)

            dab: DabItem = {
                "num": dab_num,
                "title": title,
                "link_to": link_to,
                "html": get_dab_html(dab_num, self.dab_html[title]),
            }
            self.dab_list.append(dab)
            self.dab_order.append(title)
            self.dab_lookup[dab_num] = title

    def get_html(self) -> str:
        """Return the processed article HTML."""
        html: str = lxml.html.tostring(self.root, encoding=str)
        return html
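
# Example usage, a sketch: Article needs network access to the MediaWiki API,
# and preview_endpoint additionally needs a Flask app context for url_for.
#
#     article = Article("Give Peace a Chance")
#     article.load()
#     article.process_links()
#     body_html = article.get_html()  # article HTML with dab links tagged
#     dab_pages = article.dab_list    # one DabItem per ambiguous link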