from __future__ import unicode_literals import re import typing from .api import MissingPage, call_get_diff, get_wiki_info from .core import get_case_from_content, get_content_and_timestamp, get_revision_info from .util import is_title_case, lc_alpha re_link_in_text = re.compile(r"\[\[[^]]+?\]\]", re.I | re.S) re_category_link = re.compile(r"\[\[Category:[^\]]+\]\]", re.I) # Matches when extra words separate the match from a following (ABBREVIATION), # indicating the matched text is just part of a longer named entity. # e.g. "of Russia (AGMR)" matches; " (AGMR)" does not (no intervening words). re_named_entity_abbrev = re.compile(r"^(?:\s+[^\s(]+){1,6}\s*\([A-Z]{2,}\)") class LinkReplace(Exception): """Replaces and existing link.""" en_dash = "\u2013" trans = {",": ",?", " ": " *[-\n]? *"} trans[en_dash] = trans[" "] trans2 = {" ": r"('?s?\]\])?'?s? ?(\[\[(?:.+\|)?)?", "-": "[- ]"} trans2[en_dash] = trans2[" "] patterns = [ lambda q: re.compile( r"(?]*?)?>\s*({{cite.*?}}|\[https?://[^]]*?\])\s*", re.I | re.S r"]*?)?>.*", re.I | re.S, ) re_cite_template_start = re.compile(r"\{\{(?:cite|citation|short description|gli|defn|annotated link|excerpt|main|see|for)\b", re.I) re_no_param_template = re.compile(r"\{\{[^|{}]+\}\}") re_external_link = re.compile(r"\[https?://[^\]]+\]") # Italic text (work titles in bibliographies). Handles apostrophes in content # (e.g. ''It's fine'') but requires at least one non-apostrophe character so # that ''' bold ''' is not consumed as italic. re_italic = re.compile(r"''[^']+(?:'[^']+)*''") # Bullet-point lines that contain a bare URL are unformatted bibliography entries. re_bullet_with_url = re.compile(r"^\*[^\n]*https?://[^\s\n]+[^\n]*", re.MULTILINE) def find_cite_template_spans(text: str) -> list[tuple[int, int]]: """Find (start, end) spans of {{Cite ...}} and similar templates, handling nested braces.""" spans: list[tuple[int, int]] = [] for m in re_cite_template_start.finditer(text): start = m.start() if any(s <= start < e for s, e in spans): continue # already inside a found span depth = 0 i = start while i < len(text): if text[i : i + 2] == "{{": depth += 1 i += 2 elif text[i : i + 2] == "}}": depth -= 1 i += 2 if depth == 0: spans.append((start, i)) break else: i += 1 return spans def parse_cite(text: str) -> typing.Iterator[tuple[str, str]]: """Parse citations yielding (type, chunk) tuples, skipping ref tags, cite templates, and external links.""" regions = [(m.start(), m.end()) for m in re_cite.finditer(text)] regions.extend(find_cite_template_spans(text)) regions.extend((m.start(), m.end()) for m in re_no_param_template.finditer(text)) regions.extend((m.start(), m.end()) for m in re_external_link.finditer(text)) regions.extend((m.start(), m.end()) for m in re_italic.finditer(text)) regions.extend((m.start(), m.end()) for m in re_bullet_with_url.finditer(text)) regions.sort() # Merge overlapping regions (e.g. a {{Cite}} that sits inside a ) merged: list[tuple[int, int]] = [] for start, end in regions: if merged and start < merged[-1][1]: merged[-1] = (merged[-1][0], max(merged[-1][1], end)) else: merged.append((start, end)) prev = 0 for start, end in merged: yield ("text", text[prev:start]) yield ("cite", text[start:end]) prev = end yield ("text", text[prev:]) re_heading = re.compile(r"^\s*(=+)\s*(.+)\s*\1(|\s)*$") def section_iter(text: str) -> typing.Iterator[tuple[str | None, str]]: """Iterate sections yielding tuples of heading and section text.""" cur_section = "" heading = None in_comment = False for line in text.splitlines(True): if "" in line: in_comment = False m = re_heading.match(line) if in_comment or not m: cur_section += line continue if cur_section or heading: yield (heading, cur_section) heading = m.group() cur_section = "" continue yield (heading, cur_section) def get_subsections(text: str, section_num: int) -> str: """Retrieve the text of subsections for a given section number within an article.""" found = "" collection_level = None for num, (heading, body) in enumerate(section_iter(text)): if heading is None: level = 0 else: m = re_heading.match(heading) assert m level = len(m.group(1)) if num == section_num: collection_level = level continue if collection_level: if level > collection_level: assert heading found += heading + body else: break return found def match_found(m: re.Match[str], q: str, linkto: str | None) -> str: if q[1:] == m.group(0)[1:]: replacement = m.group(1) + q[1:] elif any(c.isupper() for c in q[1:]) or m.group(0) == m.group(0).upper(): replacement = q elif is_title_case(m.group(0)): replacement = None replacement = get_case_from_content(q) if replacement is None: replacement = q.lower() else: replacement = m.group(1) + q[1:] assert replacement if m.group(1).isupper() and replacement[0].islower(): pos = m.start() if pos == 0 or m.string[pos - 1] == "\n": replacement = replacement[0].upper() + replacement[1:] if linkto: if ( linkto[0].isupper() and replacement[0].islower() and not is_title_case(replacement) ): linkto = linkto[0].lower() + linkto[1:] elif replacement[0].isupper(): linkto = linkto[0].upper() + linkto[1:] replacement = linkto + "|" + replacement return replacement def parse_links(text: str) -> typing.Iterator[tuple[str, str]]: prev = 0 for m in re_link_in_text.finditer(text): if prev != m.start(): yield ("text", text[prev : m.start()]) if any( m.group().lower().startswith("[[" + prefix) for prefix in ("file:", "image:") ): yield ("image", m.group(0)) elif m.group().lower().startswith("[[category:"): yield ("category", m.group(0)) else: yield ("link", m.group(0)) prev = m.end() if prev < len(text): yield ("text", text[prev:]) def mk_link_matcher(q: str) -> typing.Callable[[str], re.Match[str] | None]: """Make link matcher.""" re_links = [p(q) for p in patterns] def search_for_link(text: str) -> re.Match[str] | None: for re_link in re_links: m = re_link.search(text) if m and m.group(0).count("[[") < 4: return m return None return search_for_link def add_link(m: re.Match[str], replacement: str, text: str) -> str: """Add link to text.""" matched_text = m.group(0) if matched_text.startswith("[[") and matched_text.endswith("|"): return m.re.sub(lambda m: f"[[{replacement}|", text, count=1) split_links = matched_text.find("]] [[") if split_links > 0 and m.start() >= 2 and text[m.start() - 2 : m.start()] == "[[": # Match starts inside one link and continues into the next opening link. # Link only the text from the first link span and leave the second link as-is. link_dest = replacement.split("|")[0] if "|" in replacement else replacement visible = matched_text[:split_links] return text[: m.start() - 2] + f"[[{link_dest}|{visible}]]" + text[m.start() + split_links + 2 :] inner_bracket = matched_text.find("[[") if inner_bracket > 0: prefix = matched_text[:inner_bracket].rstrip() sep = matched_text[len(prefix) : inner_bracket] suffix = matched_text[inner_bracket:] link_dest = replacement.split("|")[0] if "|" in replacement else replacement if text[m.end() : m.end() + 2] == "]]": # The existing [[link]] is fully consumed by the match (its closing ]] # immediately follows). Replace everything with a single clean link. # e.g. "surface [[runoff (hydrology)|runoff]]" → "[[surface runoff]]" return text[: m.start()] + f"[[{link_dest}]]" + text[m.end() + 2 :] # The existing link is only partially consumed; keep it and link just the prefix. # e.g. "cross-platform [[interchange station]]" # → "[[cross-platform interchange|cross-platform]] [[interchange station]]" return text[: m.start()] + f"[[{link_dest}|{prefix}]]{sep}{suffix}" + text[m.end() :] close_bracket = matched_text.find("]]") if close_bracket > 0 and m.start() >= 2 and text[m.start() - 2 : m.start()] == "[[": # Match started inside an existing [[link]] and spans beyond its closing ]]. # e.g. "[[anti-globalization]] movement" matched as "anti-globalization]] movement" # → absorb the leading [[ and produce "[[anti-globalization movement]]" return text[: m.start() - 2] + f"[[{replacement}]]" + text[m.end() :] return m.re.sub(lambda m: f"[[{replacement}]]", text, count=1) def match_spans_existing_link(m: re.Match[str], text: str) -> bool: """Return True if the match starts partway inside an existing [[link]]. The trans2 space pattern allows matching across ]] boundaries, which is intentional for cases like [[anti-globalization]] movement (match starts right after [[). But when the match starts *inside* a link (e.g. [[impervious surface]] runoff matching 'surface runoff'), absorbing the brackets would produce broken wikitext — skip those matches instead. """ close_bracket = m.group(0).find("]]") if close_bracket <= 0: return False # close_bracket > 0: match spans ][. If it starts right after [[ that # opening bracket is absorbed by add_link — not our problem here. if m.start() >= 2 and text[m.start() - 2 : m.start()] == "[[": return False return True def match_is_inside_existing_link(m: re.Match[str], text: str) -> bool: """Return True if the match sits entirely within an existing [[link]] span. Catches matches in link destinations, e.g. finding 'Post-fascism' inside [[Post-fascism in Italy|post-fascist]] and trying to wrap it again. Matches that span a ]] boundary are left for add_link's cross-link logic. """ if "]]" in m.group(0): return False for link_m in re_link_in_text.finditer(text): if link_m.start() <= m.start() and m.end() <= link_m.end(): return True return False def is_part_of_named_entity(m: re.Match[str], text: str) -> bool: """Return True if the match is a title-case fragment of a longer named entity. Detects the pattern: matched text (title case) + extra words + (ABBREVIATION). e.g. "Anti-Globalization Movement" followed by "of Russia (AGMR)" → True. But "Anti-Globalization Movement" followed directly by "(AGM)" → False, since the match itself is the full entity name. """ if not m.group(0)[0].isupper(): return False return bool(re_named_entity_abbrev.match(text[m.end() :])) def find_link_in_chunk( q: str, content: str, linkto: str | None = None ) -> tuple[str, str | None, str | None]: """Find link in chunk.""" search_for_link = mk_link_matcher(q) new_content = "" replacement = None match_in_non_link = False bad_link_match = False found_text_to_link = None for token_type, text in parse_links(content): if token_type == "text": if search_for_link(text): match_in_non_link = True elif token_type == "image": before, sep, link_text = text[:-2].rpartition("|") m = search_for_link(link_text) if m: found_text_to_link = m.group(0) replacement = match_found(m, q, linkto) text = before + sep + add_link(m, replacement, link_text) + "]]" elif token_type == "link" and not replacement and not match_in_non_link: link_text = text[2:-2] link_dest = None if "|" in link_text: link_dest, link_text = link_text.split("|", 1) m = search_for_link(link_text) if m and (not link_dest or not link_dest.startswith("#")): lc_alpha_q = lc_alpha(q) bad_link_match = ( link_dest and len(link_dest) > len(q) and ( lc_alpha_q not in lc_alpha(link_dest) or lc_alpha(link_dest).startswith(lc_alpha_q) ) ) if not link_dest: if q in link_text and len(link_text) > len(q): bad_link_match = True if bad_link_match and link_dest: try: link_dest_redirect = get_wiki_info(link_dest) except MissingPage: link_dest_redirect = None if ( link_dest_redirect and lc_alpha(link_dest_redirect) == lc_alpha_q ): bad_link_match = False if not bad_link_match: replacement = match_found(m, q, linkto) found_text_to_link = m.group(0) text = add_link(m, replacement, link_text) new_content += text if not replacement: if bad_link_match: raise LinkReplace masked = re_category_link.sub(lambda c: " " * len(c.group(0)), content) m = search_for_link(masked) if m and not is_part_of_named_entity(m, content) and not match_spans_existing_link(m, content) and not match_is_inside_existing_link(m, content): found_text_to_link = m.group(0) replacement = match_found(m, q, linkto) new_content = add_link(m, replacement, content) if linkto: m_end = m.end() re_extend = re.compile(m.re.pattern + r"\w*\b", re.I) m = re_extend.search(content) if m and m.end() > m_end: replacement += content[m_end : m.end()] new_content = add_link(m, replacement, content) return (new_content, replacement, found_text_to_link) def find_link_in_text(q: str, content: str) -> tuple[str, str]: (new_content, replacement) = find_link_in_chunk(q, content) if replacement: return (new_content, replacement) raise NoMatch def find_link_in_content(q, content, linkto=None): if linkto: try: return find_link_in_content(linkto, content) except NoMatch: pass replacement = None new_content = "" link_replace = False for header, section_text in section_iter(content): if header: new_content += header for token_type, text in parse_cite(section_text): if token_type == "text" and not replacement: try: (new_text, replacement, replaced_text) = find_link_in_chunk( q, text, linkto=linkto ) except LinkReplace: link_replace = True if replacement: text = new_text new_content += text if replacement: return (new_content, replacement, replaced_text) raise LinkReplace if link_replace else NoMatch def find_link_and_section(q: str, content: str, linkto: str | None = None): if linkto: try: return find_link_and_section(linkto, content) except NoMatch: pass sections = list(section_iter(content)) replacement = None search_for_link = mk_link_matcher(q) found: dict[str, str | int] = {} for section_num, (header, section_text) in enumerate(sections): new_content = "" if header: new_content += header for token_type, text in parse_cite(section_text): if token_type != "text" or replacement: new_content += text continue new_text = "" for token_type2, text2 in parse_links(text): if token_type2 == "link" and not replacement: link_text = text2[2:-2] if "|" in link_text: link_dest, link_text = link_text.split("|", 1) else: link_dest = None m = search_for_link(link_text) if m: lc_alpha_q = lc_alpha(q) bad = link_dest and len(link_dest) > len(q) and ( lc_alpha_q not in lc_alpha(link_dest) or lc_alpha(link_dest).startswith(lc_alpha_q) ) if not bad and not link_dest: bad = len(link_text) > len(q) and lc_alpha_q in lc_alpha(link_text) and lc_alpha(link_text).startswith(lc_alpha_q) if bad and link_dest: try: redirect = get_wiki_info(link_dest) except MissingPage: redirect = None if redirect and lc_alpha(redirect) == lc_alpha_q: bad = False if not bad: if link_dest: found["link_dest"] = link_dest found["link_text"] = link_text replacement = match_found(m, q, None) text2 = add_link(m, replacement, link_text) new_text += text2 if replacement: text = new_text else: masked = re_category_link.sub(lambda c: " " * len(c.group(0)), text) m = search_for_link(masked) if m and not is_part_of_named_entity(m, text) and not match_spans_existing_link(m, text) and not match_is_inside_existing_link(m, text): replacement = match_found(m, q, linkto) text = add_link(m, replacement, text) new_content += text if replacement: found.update( { "section_num": section_num, "section_text": new_content, "old_text": (header or "") + section_text, "replacement": replacement, } ) return found raise NoMatch def find_refs(text: str) -> list[str]: """Find in wikitext.""" refs = re.findall("]*)>(.+?)", text) return refs def new_link_is_in_ref(replacement: str, text: str) -> bool: """Is the new link in a .""" link = f"[[{replacement}]]" return any(link in ref for ref in find_refs(text)) def get_match(q: str, title: str, linkto: str | None) -> dict[str, typing.Any]: """Get match.""" rev = get_revision_info(title) found: dict[str, typing.Any] = find_link_and_section(q, rev["content"], linkto) assert not new_link_is_in_ref(found["replacement"], found["section_text"]) found["revid"] = rev["revid"] found["pageid"] = rev["pageid"] found["section_text"] += get_subsections(rev["content"], found["section_num"]) return found def get_diff(q: str, title: str, linkto: str | None) -> dict[str, typing.Any]: """Get diff.""" content, timestamp = get_content_and_timestamp(title) found: dict[str, typing.Any] = find_link_and_section(q, content, linkto) if new_link_is_in_ref(found["replacement"], found["section_text"]): raise NoMatch section_text = found["section_text"] + get_subsections( content, found["section_num"] ) found["diff"] = call_get_diff(title, found["section_num"], section_text) if not found["diff"]: raise NoMatch return found