Include all occurrences of dab links, not just the first.

This commit is contained in:
Edward Betts 2022-08-18 20:51:18 +01:00
parent 46fa6cc63e
commit daf2a25458

View file

@ -121,10 +121,9 @@ def delete_toc(root: lxml.html.HtmlElement) -> None:
toc.getparent().remove(toc) toc.getparent().remove(toc)
def get_dab_html(dab_num: int, title: str) -> str: def get_dab_html(dab_num: int, html: str) -> str:
"""Parse dab page and rewrite links.""" """Parse dab page and rewrite links."""
dab_html = get_article_html(title) root = lxml.html.fromstring(html)
root = lxml.html.fromstring(dab_html)
delete_toc(root) delete_toc(root)
element_id_map = {e.get("id"): e for e in root.findall(".//*[@id]")} element_id_map = {e.get("id"): e for e in root.findall(".//*[@id]")}
@ -160,6 +159,7 @@ class Article:
self.dab_lookup: dict[int, str] = {} self.dab_lookup: dict[int, str] = {}
self.dab_order: list[str] = [] self.dab_order: list[str] = []
self.parse: Optional[dict[str, Any]] = None self.parse: Optional[dict[str, Any]] = None
self.dab_html: dict[str, str] = {}
def save_endpoint(self) -> str: def save_endpoint(self) -> str:
"""Endpoint for saving changes.""" """Endpoint for saving changes."""
@ -173,28 +173,25 @@ class Article:
def iter_links(self) -> Iterator[tuple[lxml.html.Element, str]]: def iter_links(self) -> Iterator[tuple[lxml.html.Element, str]]:
"""Disambiguation links that need fixing.""" """Disambiguation links that need fixing."""
seen = set()
for a in self.root.findall(".//a[@href]"): for a in self.root.findall(".//a[@href]"):
title = a.get("title") title = a.get("title")
if title is None or title not in self.links: if title is None or title not in self.links:
continue continue
a.set("class", "disambig")
if title in seen:
continue
seen.add(title)
yield a, title yield a, title
def process_links(self) -> None: def process_links(self) -> None:
"""Process links in parsed wikitext.""" """Process links in parsed wikitext."""
for dab_num, (a, title) in enumerate(self.iter_links()): for dab_num, (a, title) in enumerate(self.iter_links()):
a.set("class", "disambig")
a.set("id", f"dab-{dab_num}") a.set("id", f"dab-{dab_num}")
if title not in self.dab_html:
self.dab_html[title] = get_article_html(title)
dab: DabItem = { dab: DabItem = {
"num": dab_num, "num": dab_num,
"title": title, "title": title,
"html": get_dab_html(dab_num, title), "html": get_dab_html(dab_num, self.dab_html[title]),
} }
self.dab_list.append(dab) self.dab_list.append(dab)
self.dab_order.append(title) self.dab_order.append(title)