Improve link matching to avoid many classes of bad edits
parse_cite: extend to skip {{cite}}/{{citation}}, {{short description}},
{{gli}}, {{defn}}, external links [https://...], italic text ''...'',
and bullet-point lines containing bare URLs (unformatted bibliography
entries). Uses brace-counting to handle nested templates correctly.
parse_links: yield [[Category:...]] links as 'category' tokens so they
are never modified.
add_link: handle three new boundary cases where the match spans an
existing [[link]]:
- match ends exactly at the link boundary: replace the whole thing with
a single clean link (e.g. surface [[runoff (hydrology)|runoff]] →
[[surface runoff]])
- match starts right after [[: absorb the stray [[ (e.g.
[[anti-globalization]] movement → [[anti-globalization movement]])
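- match starts partway inside a link: not handled by add_link itself; such matches are rejected earlier by match_spans_existing_link (linking them would produce broken wikitext)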
- match spans into but not through a link: use a piped prefix link
(e.g. cross-platform [[interchange station]] →
[[cross-platform interchange|cross-platform]] [[interchange station]])
Fallback search: mask [[Category:...]] spans with spaces so the pattern
cannot match inside them. Guard against matches that are part of a
longer named entity (title-case phrase followed by extra words then an
abbreviation in parentheses, e.g. "Anti-Globalization Movement of
Russia (AGMR)").
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
95ca5f755d
commit
4fe0acc167
1 changed files with 124 additions and 12 deletions
|
|
@ -8,6 +8,12 @@ from .core import get_case_from_content, get_content_and_timestamp, get_revision
|
|||
from .util import is_title_case, lc_alpha
|
||||
|
||||
# Any wikilink: "[[", one or more characters other than "]", then "]]".
re_link_in_text = re.compile(r"\[\[[^]]+?\]\]", re.I | re.S)
# A category link such as [[Category:Hydrology]] (prefix matched case-insensitively).
re_category_link = re.compile(r"\[\[Category:[^\]]+\]\]", re.I)

# Applied to the text that FOLLOWS a match. Succeeds when one to six extra
# words separate the match from a parenthesised all-caps abbreviation,
# indicating the matched text is just part of a longer named entity.
# e.g. " of Russia (AGMR)" matches; " (AGMR)" does not (no intervening words).
re_named_entity_abbrev = re.compile(r"^(?:\s+[^\s(]+){1,6}\s*\([A-Z]{2,}\)")
|
||||
|
||||
|
||||
class LinkReplace(Exception):
|
||||
|
|
@ -72,14 +78,62 @@ re_cite = re.compile(
|
|||
re.I | re.S,
|
||||
)
|
||||
|
||||
# Opening of a template whose whole body must be left untouched. Only the
# opener is matched here; the closing braces are located by brace counting.
re_cite_template_start = re.compile(r"\{\{(?:cite|citation|short description|gli|defn)\b", re.I)
# Bracketed external link, e.g. [https://example.org label].
re_external_link = re.compile(r"\[https?://[^\]]+\]")
# Italic text (work titles in bibliographies). Handles apostrophes in content
# (e.g. ''It's fine'') and requires at least one non-apostrophe character
# directly after the opening ''.
# NOTE(review): a search may still begin at the second apostrophe of a '''
# run, so '''bold''' can be captured as ''bold'' -- confirm that is acceptable.
re_italic = re.compile(r"''[^']+(?:'[^']+)*''")
# Bullet-point lines that contain a bare URL are unformatted bibliography entries.
re_bullet_with_url = re.compile(r"^\*[^\n]*https?://[^\s\n]+[^\n]*", re.MULTILINE)
|
||||
|
||||
|
||||
def find_cite_template_spans(text: str) -> list[tuple[int, int]]:
    """Find (start, end) spans of templates that must not be edited.

    An opener is anything matched by ``re_cite_template_start`` ({{cite}},
    {{citation}}, {{short description}}, {{gli}}, {{defn}}). From each opener
    the text is scanned while counting ``{{`` / ``}}`` pairs, so templates
    nested inside the body do not end the span early.

    Returns half-open ``(start, end)`` offsets into *text*, in ascending
    order and never overlapping. An opener whose braces never balance
    contributes no span.
    """
    spans: list[tuple[int, int]] = []
    length = len(text)
    for m in re_cite_template_start.finditer(text):
        start = m.start()
        # Openers arrive in ascending order and recorded spans never overlap,
        # so only the most recent span can contain this opener.
        if spans and start < spans[-1][1]:
            continue  # already inside a found span
        depth = 0
        i = start
        while i < length:
            if text.startswith("{{", i):
                depth += 1
                i += 2
            elif text.startswith("}}", i):
                depth -= 1
                i += 2
                if depth == 0:
                    # Balanced: i now points just past the closing braces.
                    spans.append((start, i))
                    break
            else:
                i += 1
    return spans
|
||||
|
||||
|
||||
def parse_cite(text: str) -> typing.Iterator[tuple[str, str]]:
    """Split *text* into editable and protected chunks.

    Yields ``(kind, chunk)`` tuples in document order. ``kind`` is ``"cite"``
    for a protected region and ``"text"`` for everything between regions.
    Protected regions are: ``re_cite`` matches, the template spans returned by
    ``find_cite_template_spans``, bracketed external links, italic runs, and
    bullet lines containing a bare URL.

    A ``"text"`` chunk is emitted before every region and once at the end, so
    chunks may be empty strings; concatenating all chunks reproduces *text*.
    """
    regions = [(m.start(), m.end()) for m in re_cite.finditer(text)]
    regions.extend(find_cite_template_spans(text))
    for pattern in (re_external_link, re_italic, re_bullet_with_url):
        regions.extend(m.span() for m in pattern.finditer(text))
    regions.sort()

    # Merge overlapping regions (e.g. a {{Cite}} that sits inside a <ref>).
    # Regions that merely touch are kept separate.
    merged: list[tuple[int, int]] = []
    for start, end in regions:
        if merged and start < merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))

    # Walk the merged regions exactly once. Iterating re_cite matches first
    # and the merged regions afterwards would emit each ref chunk twice.
    prev = 0
    for start, end in merged:
        yield ("text", text[prev:start])
        yield ("cite", text[start:end])
        prev = end
    yield ("text", text[prev:])
|
||||
|
||||
|
||||
|
|
@ -163,6 +217,8 @@ def parse_links(text: str) -> typing.Iterator[tuple[str, str]]:
|
|||
for prefix in ("file:", "image:")
|
||||
):
|
||||
yield ("image", m.group(0))
|
||||
elif m.group().lower().startswith("[[category:"):
|
||||
yield ("category", m.group(0))
|
||||
else:
|
||||
yield ("link", m.group(0))
|
||||
prev = m.end()
|
||||
|
|
@ -186,12 +242,66 @@ def mk_link_matcher(q: str) -> typing.Callable[[str], re.Match[str] | None]:
|
|||
|
||||
def add_link(m: re.Match[str], replacement: str, text: str) -> str:
    """Return *text* with the span covered by *m* turned into a wikilink.

    *m* supplies offsets only: every edit is spliced in at ``m.start()`` /
    ``m.end()``. *text* must therefore have the same offsets as the string
    that was searched, but it need not be the same string. Callers search a
    copy with ``[[Category:...]]`` blanked out and pass the real text here;
    re-running the regex on *text* could hit an earlier occurrence inside a
    category link instead.

    Cases, in order:

    * match is ``[[...|`` (target part of a piped link): swap the target.
    * match contains ``[[`` after some leading words:
        - the link's ``]]`` directly follows the match: replace everything
          with one clean link,
          e.g. ``surface [[runoff (hydrology)|runoff]]`` -> ``[[surface runoff]]``
        - otherwise keep the existing link and link only the leading words,
          e.g. ``cross-platform [[interchange station]]``
          -> ``[[cross-platform interchange|cross-platform]] [[interchange station]]``
    * match contains ``]]`` and is directly preceded by ``[[``: absorb the
      opening brackets,
      e.g. ``[[anti-globalization]] movement`` -> ``[[anti-globalization movement]]``
    * anything else: wrap the match as ``[[replacement]]``.
    """
    matched_text = m.group(0)
    begin, finish = m.span()
    before, after = text[:begin], text[finish:]

    if matched_text.startswith("[[") and matched_text.endswith("|"):
        return f"{before}[[{replacement}|{after}"

    inner_bracket = matched_text.find("[[")
    if inner_bracket > 0:
        prefix = matched_text[:inner_bracket].rstrip()
        # Whitespace between the leading words and the existing "[[".
        sep = matched_text[len(prefix) : inner_bracket]
        suffix = matched_text[inner_bracket:]
        # Only the target part of a piped replacement is usable as a link target.
        link_dest = replacement.partition("|")[0]
        if after.startswith("]]"):
            # The existing [[link]] is fully consumed by the match (its closing ]]
            # immediately follows). Replace everything with a single clean link.
            return f"{before}[[{link_dest}]]{after[2:]}"
        # The existing link is only partially consumed; keep it and link just the prefix.
        return f"{before}[[{link_dest}|{prefix}]]{sep}{suffix}{after}"

    close_bracket = matched_text.find("]]")
    if close_bracket > 0 and begin >= 2 and text[begin - 2 : begin] == "[[":
        # Match began right after "[[" and runs past that link's closing "]]";
        # drop the stray opening brackets and emit one link.
        return f"{text[: begin - 2]}[[{replacement}]]{after}"

    return f"{before}[[{replacement}]]{after}"
|
||||
|
||||
|
||||
def match_spans_existing_link(m: re.Match[str], text: str) -> bool:
    """Return True if the match starts partway inside an existing [[link]].

    A match may legitimately run across a closing ``]]`` when it begins
    directly after the opening ``[[`` (e.g. ``[[anti-globalization]] movement``);
    add_link absorbs those brackets. When the match begins further inside the
    link (e.g. ``[[impervious surface]] runoff`` matching ``surface runoff``),
    linking it would produce broken wikitext, so the caller should skip it.

    *m* supplies the matched string and its start offset; *text* is the string
    those offsets index into.
    """
    close_bracket = m.group(0).find("]]")
    if close_bracket <= 0:
        # No "]]" in the match, or the match itself begins with "]]".
        return False
    # The match crosses a closing "]]". If it starts right after "[[", that
    # opening bracket is absorbed by add_link, so it is not a problem here.
    begin = m.start()
    starts_after_open = begin >= 2 and text.startswith("[[", begin - 2)
    return not starts_after_open
|
||||
|
||||
|
||||
def is_part_of_named_entity(m: re.Match[str], text: str) -> bool:
    """Return True if the match looks like a fragment of a longer named entity.

    Detects the pattern: matched text starting with an uppercase character
    + extra words + (ABBREVIATION), as recognised by ``re_named_entity_abbrev``
    applied to the text right after the match.
    e.g. "Anti-Globalization Movement" followed by " of Russia (AGMR)" -> True.
    But "Anti-Globalization Movement" followed directly by " (AGM)" -> False,
    since the match itself is the full entity name.

    Only the first character of the match is inspected for case. A
    zero-length match yields False.
    """
    # [:1] rather than [0] so an empty match is rejected instead of raising.
    if not m.group(0)[:1].isupper():
        return False
    return re_named_entity_abbrev.match(text[m.end() :]) is not None
|
||||
|
||||
|
||||
def find_link_in_chunk(
|
||||
|
|
@ -252,8 +362,9 @@ def find_link_in_chunk(
|
|||
if not replacement:
|
||||
if bad_link_match:
|
||||
raise LinkReplace
|
||||
m = search_for_link(content)
|
||||
if m:
|
||||
masked = re_category_link.sub(lambda c: " " * len(c.group(0)), content)
|
||||
m = search_for_link(masked)
|
||||
if m and not is_part_of_named_entity(m, content) and not match_spans_existing_link(m, content):
|
||||
found_text_to_link = m.group(0)
|
||||
replacement = match_found(m, q, linkto)
|
||||
new_content = add_link(m, replacement, content)
|
||||
|
|
@ -342,8 +453,9 @@ def find_link_and_section(q: str, content: str, linkto: str | None = None):
|
|||
if replacement:
|
||||
text = new_text
|
||||
else:
|
||||
m = search_for_link(text)
|
||||
if m:
|
||||
masked = re_category_link.sub(lambda c: " " * len(c.group(0)), text)
|
||||
m = search_for_link(masked)
|
||||
if m and not is_part_of_named_entity(m, text) and not match_spans_existing_link(m, text):
|
||||
replacement = match_found(m, q, linkto)
|
||||
text = add_link(m, replacement, text)
|
||||
new_content += text
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue