diff --git a/add_links/match.py b/add_links/match.py
index 5cd4d21..9d00404 100644
--- a/add_links/match.py
+++ b/add_links/match.py
@@ -78,7 +78,8 @@ re_cite = re.compile(
     re.I | re.S,
 )
 
-re_cite_template_start = re.compile(r"\{\{(?:cite|citation|short description|gli|defn)\b", re.I)
+re_cite_template_start = re.compile(r"\{\{(?:cite|citation|short description|gli|defn|annotated link|excerpt|main|see)\b", re.I)
+re_no_param_template = re.compile(r"\{\{[^|{}]+\}\}")
 re_external_link = re.compile(r"\[https?://[^\]]+\]")
 # Italic text (work titles in bibliographies). Handles apostrophes in content
 # (e.g. ''It's fine'') but requires at least one non-apostrophe character so
@@ -89,7 +90,7 @@ re_bullet_with_url = re.compile(r"^\*[^\n]*https?://[^\s\n]+[^\n]*", re.MULTILIN
 
 
 def find_cite_template_spans(text: str) -> list[tuple[int, int]]:
-    """Find (start, end) spans of {{Cite ...}} templates, handling nested braces."""
+    """Find (start, end) spans of {{Cite ...}} and similar templates, handling nested braces."""
     spans: list[tuple[int, int]] = []
     for m in re_cite_template_start.finditer(text):
         start = m.start()
@@ -116,6 +117,7 @@ def parse_cite(text: str) -> typing.Iterator[tuple[str, str]]:
     """Parse citations yielding (type, chunk) tuples, skipping ref tags, cite templates, and external links."""
     regions = [(m.start(), m.end()) for m in re_cite.finditer(text)]
     regions.extend(find_cite_template_spans(text))
+    regions.extend((m.start(), m.end()) for m in re_no_param_template.finditer(text))
     regions.extend((m.start(), m.end()) for m in re_external_link.finditer(text))
     regions.extend((m.start(), m.end()) for m in re_italic.finditer(text))
     regions.extend((m.start(), m.end()) for m in re_bullet_with_url.finditer(text))
@@ -198,6 +200,10 @@ def match_found(m: re.Match[str], q: str, linkto: str | None) -> str:
     else:
         replacement = m.group(1) + q[1:]
     assert replacement
+    if m.group(1).isupper() and replacement[0].islower():
+        pos = m.start()
+        if pos == 0 or m.string[pos - 1] == "\n":
+            replacement = replacement[0].upper() + replacement[1:]
     if linkto:
         if linkto[0].isupper() and replacement[0] == linkto[0].lower():
             linkto = linkto[0].lower() + linkto[1:]
@@ -291,6 +297,21 @@ def match_spans_existing_link(m: re.Match[str], text: str) -> bool:
     return True
 
 
+def match_is_inside_existing_link(m: re.Match[str], text: str) -> bool:
+    """Return True if the match sits entirely within an existing [[link]] span.
+
+    Catches matches in link destinations, e.g. finding 'Post-fascism' inside
+    [[Post-fascism in Italy|post-fascist]] and trying to wrap it again.
+    Matches that span a ]] boundary are left for add_link's cross-link logic.
+    """
+    if "]]" in m.group(0):
+        return False
+    for link_m in re_link_in_text.finditer(text):
+        if link_m.start() <= m.start() and m.end() <= link_m.end():
+            return True
+    return False
+
+
 def is_part_of_named_entity(m: re.Match[str], text: str) -> bool:
     """Return True if the match is a title-case fragment of a longer named entity.
 
@@ -339,7 +360,10 @@ def find_link_in_chunk(
                 bad_link_match = (
                     link_dest
                     and len(link_dest) > len(q)
-                    and (lc_alpha_q not in lc_alpha(link_dest))
+                    and (
+                        lc_alpha_q not in lc_alpha(link_dest)
+                        or lc_alpha(link_dest).startswith(lc_alpha_q)
+                    )
                 )
                 if not link_dest:
                     if q in link_text and len(link_text) > len(q):
@@ -364,7 +388,7 @@ def find_link_in_chunk(
             raise LinkReplace
         masked = re_category_link.sub(lambda c: " " * len(c.group(0)), content)
         m = search_for_link(masked)
-        if m and not is_part_of_named_entity(m, content) and not match_spans_existing_link(m, content):
+        if m and not is_part_of_named_entity(m, content) and not match_spans_existing_link(m, content) and not match_is_inside_existing_link(m, content):
             found_text_to_link = m.group(0)
             replacement = match_found(m, q, linkto)
             new_content = add_link(m, replacement, content)
@@ -444,18 +468,33 @@ def find_link_and_section(q: str, content: str, linkto: str | None = None):
                             link_dest = None
                         m = search_for_link(link_text)
                         if m:
-                            if link_dest:
-                                found["link_dest"] = link_dest
-                            found["link_text"] = link_text
-                            replacement = match_found(m, q, None)
-                            text2 = add_link(m, replacement, link_text)
+                            lc_alpha_q = lc_alpha(q)
+                            bad = link_dest and len(link_dest) > len(q) and (
+                                lc_alpha_q not in lc_alpha(link_dest)
+                                or lc_alpha(link_dest).startswith(lc_alpha_q)
+                            )
+                            if not bad and not link_dest:
+                                bad = len(link_text) > len(q) and lc_alpha_q in lc_alpha(link_text) and lc_alpha(link_text).startswith(lc_alpha_q)
+                            if bad and link_dest:
+                                try:
+                                    redirect = get_wiki_info(link_dest)
+                                except MissingPage:
+                                    redirect = None
+                                if redirect and lc_alpha(redirect) == lc_alpha_q:
+                                    bad = False
+                            if not bad:
+                                if link_dest:
+                                    found["link_dest"] = link_dest
+                                found["link_text"] = link_text
+                                replacement = match_found(m, q, None)
+                                text2 = add_link(m, replacement, link_text)
                     new_text += text2
                 if replacement:
                     text = new_text
                 else:
                     masked = re_category_link.sub(lambda c: " " * len(c.group(0)), text)
                     m = search_for_link(masked)
-                    if m and not is_part_of_named_entity(m, text) and not match_spans_existing_link(m, text):
+                    if m and not is_part_of_named_entity(m, text) and not match_spans_existing_link(m, text) and not match_is_inside_existing_link(m, text):
                         replacement = match_found(m, q, linkto)
                         text = add_link(m, replacement, text)
             new_content += text