Improve link matching to avoid more classes of bad edits
- Skip no-parameter templates (navboxes), and add annotated link, excerpt, main and see to the list of skipped parameterised templates.
- Preserve sentence-initial capitalisation when the replacement is lowercase.
- Skip matches that sit entirely inside an existing [[link]] destination.
- Treat link destinations that start with the search term (q) as more specific links to preserve, in both find_link_in_chunk and find_link_and_section.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
4fe0acc167
commit
fe89db11bd
1 changed file with 49 additions and 10 deletions
|
|
@ -78,7 +78,8 @@ re_cite = re.compile(
|
||||||
re.I | re.S,
|
re.I | re.S,
|
||||||
)
|
)
|
||||||
|
|
||||||
re_cite_template_start = re.compile(r"\{\{(?:cite|citation|short description|gli|defn)\b", re.I)
|
re_cite_template_start = re.compile(r"\{\{(?:cite|citation|short description|gli|defn|annotated link|excerpt|main|see)\b", re.I)
|
||||||
|
re_no_param_template = re.compile(r"\{\{[^|{}]+\}\}")
|
||||||
re_external_link = re.compile(r"\[https?://[^\]]+\]")
|
re_external_link = re.compile(r"\[https?://[^\]]+\]")
|
||||||
# Italic text (work titles in bibliographies). Handles apostrophes in content
|
# Italic text (work titles in bibliographies). Handles apostrophes in content
|
||||||
# (e.g. ''It's fine'') but requires at least one non-apostrophe character so
|
# (e.g. ''It's fine'') but requires at least one non-apostrophe character so
|
||||||
|
|
@ -89,7 +90,7 @@ re_bullet_with_url = re.compile(r"^\*[^\n]*https?://[^\s\n]+[^\n]*", re.MULTILIN
|
||||||
|
|
||||||
|
|
||||||
def find_cite_template_spans(text: str) -> list[tuple[int, int]]:
|
def find_cite_template_spans(text: str) -> list[tuple[int, int]]:
|
||||||
"""Find (start, end) spans of {{Cite ...}} templates, handling nested braces."""
|
"""Find (start, end) spans of {{Cite ...}} and similar templates, handling nested braces."""
|
||||||
spans: list[tuple[int, int]] = []
|
spans: list[tuple[int, int]] = []
|
||||||
for m in re_cite_template_start.finditer(text):
|
for m in re_cite_template_start.finditer(text):
|
||||||
start = m.start()
|
start = m.start()
|
||||||
|
|
@ -116,6 +117,7 @@ def parse_cite(text: str) -> typing.Iterator[tuple[str, str]]:
|
||||||
"""Parse citations yielding (type, chunk) tuples, skipping ref tags, cite templates, and external links."""
|
"""Parse citations yielding (type, chunk) tuples, skipping ref tags, cite templates, and external links."""
|
||||||
regions = [(m.start(), m.end()) for m in re_cite.finditer(text)]
|
regions = [(m.start(), m.end()) for m in re_cite.finditer(text)]
|
||||||
regions.extend(find_cite_template_spans(text))
|
regions.extend(find_cite_template_spans(text))
|
||||||
|
regions.extend((m.start(), m.end()) for m in re_no_param_template.finditer(text))
|
||||||
regions.extend((m.start(), m.end()) for m in re_external_link.finditer(text))
|
regions.extend((m.start(), m.end()) for m in re_external_link.finditer(text))
|
||||||
regions.extend((m.start(), m.end()) for m in re_italic.finditer(text))
|
regions.extend((m.start(), m.end()) for m in re_italic.finditer(text))
|
||||||
regions.extend((m.start(), m.end()) for m in re_bullet_with_url.finditer(text))
|
regions.extend((m.start(), m.end()) for m in re_bullet_with_url.finditer(text))
|
||||||
|
|
@ -198,6 +200,10 @@ def match_found(m: re.Match[str], q: str, linkto: str | None) -> str:
|
||||||
else:
|
else:
|
||||||
replacement = m.group(1) + q[1:]
|
replacement = m.group(1) + q[1:]
|
||||||
assert replacement
|
assert replacement
|
||||||
|
if m.group(1).isupper() and replacement[0].islower():
|
||||||
|
pos = m.start()
|
||||||
|
if pos == 0 or m.string[pos - 1] == "\n":
|
||||||
|
replacement = replacement[0].upper() + replacement[1:]
|
||||||
if linkto:
|
if linkto:
|
||||||
if linkto[0].isupper() and replacement[0] == linkto[0].lower():
|
if linkto[0].isupper() and replacement[0] == linkto[0].lower():
|
||||||
linkto = linkto[0].lower() + linkto[1:]
|
linkto = linkto[0].lower() + linkto[1:]
|
||||||
|
|
@ -291,6 +297,21 @@ def match_spans_existing_link(m: re.Match[str], text: str) -> bool:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def match_is_inside_existing_link(m: re.Match[str], text: str) -> bool:
    """Report whether the match lies wholly inside an existing [[link]] in text.

    This guards against re-wrapping text that is already part of a link,
    for example finding 'Post-fascism' inside
    [[Post-fascism in Italy|post-fascist]].

    A match whose own text contains "]]" crosses a link boundary rather than
    sitting inside one; it is reported as False here and left for add_link's
    cross-link logic.

    Args:
        m: The candidate match; its start/end offsets are compared against
            the link spans found in ``text``.
        text: The wikitext scanned for existing links with re_link_in_text.

    Returns:
        True if some link span fully contains the match span, else False.
    """
    if "]]" in m.group(0):
        return False

    begin, finish = m.start(), m.end()
    # Containment is inclusive at both ends: a match that coincides exactly
    # with a link span also counts as being inside it.
    return any(
        link.start() <= begin and finish <= link.end()
        for link in re_link_in_text.finditer(text)
    )
|
||||||
|
|
||||||
|
|
||||||
def is_part_of_named_entity(m: re.Match[str], text: str) -> bool:
|
def is_part_of_named_entity(m: re.Match[str], text: str) -> bool:
|
||||||
"""Return True if the match is a title-case fragment of a longer named entity.
|
"""Return True if the match is a title-case fragment of a longer named entity.
|
||||||
|
|
||||||
|
|
@ -339,7 +360,10 @@ def find_link_in_chunk(
|
||||||
bad_link_match = (
|
bad_link_match = (
|
||||||
link_dest
|
link_dest
|
||||||
and len(link_dest) > len(q)
|
and len(link_dest) > len(q)
|
||||||
and (lc_alpha_q not in lc_alpha(link_dest))
|
and (
|
||||||
|
lc_alpha_q not in lc_alpha(link_dest)
|
||||||
|
or lc_alpha(link_dest).startswith(lc_alpha_q)
|
||||||
|
)
|
||||||
)
|
)
|
||||||
if not link_dest:
|
if not link_dest:
|
||||||
if q in link_text and len(link_text) > len(q):
|
if q in link_text and len(link_text) > len(q):
|
||||||
|
|
@ -364,7 +388,7 @@ def find_link_in_chunk(
|
||||||
raise LinkReplace
|
raise LinkReplace
|
||||||
masked = re_category_link.sub(lambda c: " " * len(c.group(0)), content)
|
masked = re_category_link.sub(lambda c: " " * len(c.group(0)), content)
|
||||||
m = search_for_link(masked)
|
m = search_for_link(masked)
|
||||||
if m and not is_part_of_named_entity(m, content) and not match_spans_existing_link(m, content):
|
if m and not is_part_of_named_entity(m, content) and not match_spans_existing_link(m, content) and not match_is_inside_existing_link(m, content):
|
||||||
found_text_to_link = m.group(0)
|
found_text_to_link = m.group(0)
|
||||||
replacement = match_found(m, q, linkto)
|
replacement = match_found(m, q, linkto)
|
||||||
new_content = add_link(m, replacement, content)
|
new_content = add_link(m, replacement, content)
|
||||||
|
|
@ -444,6 +468,21 @@ def find_link_and_section(q: str, content: str, linkto: str | None = None):
|
||||||
link_dest = None
|
link_dest = None
|
||||||
m = search_for_link(link_text)
|
m = search_for_link(link_text)
|
||||||
if m:
|
if m:
|
||||||
|
lc_alpha_q = lc_alpha(q)
|
||||||
|
bad = link_dest and len(link_dest) > len(q) and (
|
||||||
|
lc_alpha_q not in lc_alpha(link_dest)
|
||||||
|
or lc_alpha(link_dest).startswith(lc_alpha_q)
|
||||||
|
)
|
||||||
|
if not bad and not link_dest:
|
||||||
|
bad = len(link_text) > len(q) and lc_alpha_q in lc_alpha(link_text) and lc_alpha(link_text).startswith(lc_alpha_q)
|
||||||
|
if bad and link_dest:
|
||||||
|
try:
|
||||||
|
redirect = get_wiki_info(link_dest)
|
||||||
|
except MissingPage:
|
||||||
|
redirect = None
|
||||||
|
if redirect and lc_alpha(redirect) == lc_alpha_q:
|
||||||
|
bad = False
|
||||||
|
if not bad:
|
||||||
if link_dest:
|
if link_dest:
|
||||||
found["link_dest"] = link_dest
|
found["link_dest"] = link_dest
|
||||||
found["link_text"] = link_text
|
found["link_text"] = link_text
|
||||||
|
|
@ -455,7 +494,7 @@ def find_link_and_section(q: str, content: str, linkto: str | None = None):
|
||||||
else:
|
else:
|
||||||
masked = re_category_link.sub(lambda c: " " * len(c.group(0)), text)
|
masked = re_category_link.sub(lambda c: " " * len(c.group(0)), text)
|
||||||
m = search_for_link(masked)
|
m = search_for_link(masked)
|
||||||
if m and not is_part_of_named_entity(m, text) and not match_spans_existing_link(m, text):
|
if m and not is_part_of_named_entity(m, text) and not match_spans_existing_link(m, text) and not match_is_inside_existing_link(m, text):
|
||||||
replacement = match_found(m, q, linkto)
|
replacement = match_found(m, q, linkto)
|
||||||
text = add_link(m, replacement, text)
|
text = add_link(m, replacement, text)
|
||||||
new_content += text
|
new_content += text
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue