diff --git a/add_links/match.py b/add_links/match.py index cc18fdc..5cd4d21 100644 --- a/add_links/match.py +++ b/add_links/match.py @@ -8,6 +8,12 @@ from .core import get_case_from_content, get_content_and_timestamp, get_revision from .util import is_title_case, lc_alpha re_link_in_text = re.compile(r"\[\[[^]]+?\]\]", re.I | re.S) +re_category_link = re.compile(r"\[\[Category:[^\]]+\]\]", re.I) + +# Matches when extra words separate the match from a following (ABBREVIATION), +# indicating the matched text is just part of a longer named entity. +# e.g. "of Russia (AGMR)" matches; " (AGMR)" does not (no intervening words). +re_named_entity_abbrev = re.compile(r"^(?:\s+[^\s(]+){1,6}\s*\([A-Z]{2,}\)") class LinkReplace(Exception): @@ -72,14 +78,62 @@ re_cite = re.compile( re.I | re.S, ) +re_cite_template_start = re.compile(r"\{\{(?:cite|citation|short description|gli|defn)\b", re.I) +re_external_link = re.compile(r"\[https?://[^\]]+\]") +# Italic text (work titles in bibliographies). Handles apostrophes in content +# (e.g. ''It's fine'') but requires at least one non-apostrophe character so +# that ''' bold ''' is not consumed as italic. +re_italic = re.compile(r"''[^']+(?:'[^']+)*''") +# Bullet-point lines that contain a bare URL are unformatted bibliography entries. 
re_bullet_with_url = re.compile(r"^\*[^\n]*https?://[^\s\n]+[^\n]*", re.MULTILINE)


def find_cite_template_spans(text: str) -> list[tuple[int, int]]:
    """Find (start, end) spans of templates matched by re_cite_template_start.

    Each span runs from the opening ``{{`` to the ``}}`` that balances it, so
    templates nested inside the matched template are included in its span.
    A template whose braces never balance is not recorded.
    """
    spans: list[tuple[int, int]] = []
    for m in re_cite_template_start.finditer(text):
        start = m.start()
        # finditer yields increasing start offsets and recorded spans never
        # overlap, so only the most recent span can contain this start.
        if spans and start < spans[-1][1]:
            continue  # nested inside a template that was already recorded
        depth = 0
        i = start
        while i < len(text):
            pair = text[i : i + 2]
            if pair == "{{":
                depth += 1
                i += 2
            elif pair == "}}":
                depth -= 1
                i += 2
                if depth == 0:
                    spans.append((start, i))
                    break
            else:
                i += 1
    return spans


def parse_cite(text: str) -> typing.Iterator[tuple[str, str]]:
    """Split text into alternating ("text", chunk) and ("cite", chunk) tuples.

    A "cite" chunk is any region that must not receive a new link: matches of
    re_cite (pattern defined elsewhere in this file), templates found by
    find_cite_template_spans, bracketed external links, italic runs, and
    bullet lines containing a bare URL. Everything between those regions is
    yielded as "text"; a final "text" chunk (possibly empty) is always
    yielded last.
    """
    regions = [m.span() for m in re_cite.finditer(text)]
    regions.extend(find_cite_template_spans(text))
    for pattern in (re_external_link, re_italic, re_bullet_with_url):
        regions.extend(m.span() for m in pattern.finditer(text))
    regions.sort()

    # Merge overlapping regions (e.g. a {{Cite}} template that sits inside
    # another protected region) so that no character is yielded twice.
    merged: list[tuple[int, int]] = []
    for start, end in regions:
        if merged and start < merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))

    prev = 0
    for start, end in merged:
        yield ("text", text[prev:start])
        yield ("cite", text[start:end])
        prev = end
    yield ("text", text[prev:])


def add_link(m: re.Match[str], replacement: str, text: str) -> str:
    """Return text with the span matched by m turned into a wiki link.

    m may come from searching a same-length masked copy of text (callers
    blank out category links before searching), so every edit is spliced in
    at m's offsets. Re-running the pattern against text could hit an earlier
    occurrence that the mask was hiding.

    Cases, in order:
    * the match is the "[[target|" head of a piped link: swap the target;
    * the match contains "[[" after some leading words: either replace the
      whole existing link or link only the leading words;
    * the match begins right after "[[" and runs past the closing "]]":
      absorb the existing brackets into one link;
    * otherwise: wrap the matched span in [[...]].
    """
    matched_text = m.group(0)
    start, end = m.span()

    if matched_text.startswith("[[") and matched_text.endswith("|"):
        return text[:start] + f"[[{replacement}|" + text[end:]

    inner_bracket = matched_text.find("[[")
    if inner_bracket > 0:
        prefix = matched_text[:inner_bracket].rstrip()
        sep = matched_text[len(prefix) : inner_bracket]
        suffix = matched_text[inner_bracket:]
        # Only the link target is wanted here; drop any "|label" part.
        link_dest = replacement.partition("|")[0]
        if text[end : end + 2] == "]]":
            # The existing [[link]] is fully consumed by the match (its closing ]]
            # immediately follows). Replace everything with a single clean link.
            # e.g. "surface [[runoff (hydrology)|runoff]]" -> "[[surface runoff]]"
            return text[:start] + f"[[{link_dest}]]" + text[end + 2 :]
        # The existing link is only partially consumed; keep it and link just the prefix.
        # e.g. "cross-platform [[interchange station]]"
        #   -> "[[cross-platform interchange|cross-platform]] [[interchange station]]"
        return text[:start] + f"[[{link_dest}|{prefix}]]{sep}{suffix}" + text[end:]

    close_bracket = matched_text.find("]]")
    if close_bracket > 0 and start >= 2 and text[start - 2 : start] == "[[":
        # Match started inside an existing [[link]] and spans beyond its closing ]].
        # e.g. "[[anti-globalization]] movement" matched as "anti-globalization]] movement"
        #   -> absorb the leading [[ and produce "[[anti-globalization movement]]"
        return text[: start - 2] + f"[[{replacement}]]" + text[end:]

    return text[:start] + f"[[{replacement}]]" + text[end:]


def match_spans_existing_link(m: re.Match[str], text: str) -> bool:
    """Return True if the match starts partway inside an existing [[link]].

    A match may run across a closing ]]. That is fine when the match begins
    immediately after the opening [[ (e.g. "[[anti-globalization]] movement"),
    because add_link absorbs those brackets. When the match begins later
    inside the link (e.g. "[[impervious surface]] runoff" matching
    "surface runoff"), absorbing the brackets would produce broken wikitext,
    so such matches are reported here for the caller to skip.
    """
    if m.group(0).find("]]") <= 0:
        return False  # the match does not run across a closing ]]
    start = m.start()
    starts_right_after_open = start >= 2 and text[start - 2 : start] == "[["
    return not starts_right_after_open


def is_part_of_named_entity(m: re.Match[str], text: str) -> bool:
    """Return True if the match looks like a fragment of a longer named entity.

    The match must begin with an uppercase character and be followed by
    extra words and then a parenthesised abbreviation, as recognised by
    re_named_entity_abbrev.
    e.g. "Anti-Globalization Movement" followed by " of Russia (AGMR)" -> True.
    But "Anti-Globalization Movement" followed directly by " (AGM)" -> False,
    since the match itself is the full entity name.
    """
    # [:1] rather than [0] so an empty match returns False instead of raising.
    if not m.group(0)[:1].isupper():
        return False
    # Slice rather than passing pos= to match(): the pattern is anchored with
    # "^", which only matches at the true start of the string it is given.
    return bool(re_named_entity_abbrev.match(text[m.end() :]))


# NOTE(review): this view also carried fragments of parse_links,
# find_link_in_chunk and find_link_and_section. They are only partially
# visible here and are intentionally not reproduced.