From daf2a254584b8eb79730a82b665a341f41650450 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Thu, 18 Aug 2022 20:51:18 +0100 Subject: [PATCH] Include all occurrences of dab links, not just the first. --- dab_mechanic/wikipedia.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/dab_mechanic/wikipedia.py b/dab_mechanic/wikipedia.py index 57c03c4..9bcb0fa 100644 --- a/dab_mechanic/wikipedia.py +++ b/dab_mechanic/wikipedia.py @@ -121,10 +121,9 @@ def delete_toc(root: lxml.html.HtmlElement) -> None: toc.getparent().remove(toc) -def get_dab_html(dab_num: int, title: str) -> str: +def get_dab_html(dab_num: int, html: str) -> str: """Parse dab page and rewrite links.""" - dab_html = get_article_html(title) - root = lxml.html.fromstring(dab_html) + root = lxml.html.fromstring(html) delete_toc(root) element_id_map = {e.get("id"): e for e in root.findall(".//*[@id]")} @@ -160,6 +159,7 @@ class Article: self.dab_lookup: dict[int, str] = {} self.dab_order: list[str] = [] self.parse: Optional[dict[str, Any]] = None + self.dab_html: dict[str, str] = {} def save_endpoint(self) -> str: """Endpoint for saving changes.""" @@ -173,28 +173,25 @@ class Article: def iter_links(self) -> Iterator[tuple[lxml.html.Element, str]]: """Disambiguation links that need fixing.""" - seen = set() for a in self.root.findall(".//a[@href]"): title = a.get("title") if title is None or title not in self.links: continue - a.set("class", "disambig") - - if title in seen: - continue - seen.add(title) - yield a, title def process_links(self) -> None: """Process links in parsed wikitext.""" for dab_num, (a, title) in enumerate(self.iter_links()): + a.set("class", "disambig") a.set("id", f"dab-{dab_num}") + if title not in self.dab_html: + self.dab_html[title] = get_article_html(title) + dab: DabItem = { "num": dab_num, "title": title, - "html": get_dab_html(dab_num, title), + "html": get_dab_html(dab_num, self.dab_html[title]), } self.dab_list.append(dab) 
self.dab_order.append(title)