- Record skips, saves, and no-match results in session["skipped"] so revisiting an article resumes past already-checked candidates - Filter self-links (case-insensitive first letter) from hit list - Use OAuth session for all API reads when logged in for higher rate limits - Add "for" template to exclusion list to avoid bad edits - Improve API error handling with HTTP status codes logged to stderr Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
564 lines
21 KiB
Python
564 lines
21 KiB
Python
from __future__ import unicode_literals
|
|
|
|
import re
|
|
import typing
|
|
|
|
from .api import MissingPage, call_get_diff, get_wiki_info
|
|
from .core import get_case_from_content, get_content_and_timestamp, get_revision_info
|
|
from .util import is_title_case, lc_alpha
|
|
|
|
# Any [[...]] wikilink: "[[", one or more characters other than "]", then "]]".
# The pattern contains neither "." nor letters, so re.S and re.I have no effect here.
re_link_in_text = re.compile(r"\[\[[^]]+?\]\]", re.I | re.S)
# A [[Category:...]] link (case-insensitive). The link finders blank these out
# before searching a chunk for the phrase.
re_category_link = re.compile(r"\[\[Category:[^\]]+\]\]", re.I)

# Matches when extra words separate the match from a following (ABBREVIATION),
# indicating the matched text is just part of a longer named entity.
# e.g. "of Russia (AGMR)" matches; " (AGMR)" does not (no intervening words).
re_named_entity_abbrev = re.compile(r"^(?:\s+[^\s(]+){1,6}\s*\([A-Z]{2,}\)")
|
|
|
|
|
|
class LinkReplace(Exception):
    """Replaces an existing link.

    Raised by find_link_in_chunk when no link was added and an existing link
    whose text matched the phrase was flagged as a bad match.
    """
|
|
|
|
|
|
en_dash = "\u2013"
# Per-character regex fragments: a comma becomes optional, and a space matches
# any run of spaces around at most one hyphen or newline. An en dash is
# treated exactly like a space.
# NOTE(review): `trans` is not referenced in the visible part of this file.
trans = {",": ",?", " ": " *[-\n]? *"}
trans[en_dash] = trans[" "]

# Per-character regex fragments used when building `patterns`. A space may
# also step over wikilink markup: an optional "]]" (optionally preceded by
# an apostrophe and/or "s"), an optional apostrophe/"s", an optional space,
# and an optional "[[" or "[[target|". A hyphen matches a hyphen or a space.
# An en dash is treated exactly like a space.
trans2 = {" ": r"('?s?\]\])?'?s? ?(\[\[(?:.+\|)?)?", "-": "[- ]"}
trans2[en_dash] = trans2[" "]
|
|
|
|
# Regex builders for a phrase q, tried in this order by mk_link_matcher.
# Every pattern is case-insensitive, refuses a match directly preceded by a
# hyphen, and captures the first character of the phrase as group 1 (so the
# caller can see the case used in the text).
patterns = [
    # "[[phrase|" - the phrase is the part before the pipe, and a "]]" must
    # follow later on the same line. Each character after the first may be
    # preceded by an optional hyphen; spaces/hyphens/en dashes use trans2.
    lambda q: re.compile(
        r"(?<!-)\[\[(%s)%s\|(?=.*\]\])"
        % (
            re.escape(q[0]),
            "".join("-?" + (trans2[c] if c in trans2 else re.escape(c)) for c in q[1:]),
        ),
        re.I,
    ),
    # "[[phrase]]" or "[[...|phrase]]" - the phrase runs up to the closing
    # "]]"; links starting with "File:" are excluded. Same flexible spelling
    # as the pattern above.
    lambda q: re.compile(
        r"(?<!-)\[\[(?:(?!File:)(?:[^]]+\|)?)(%s)%s\]\]"
        % (
            re.escape(q[0]),
            "".join("-?" + (trans2[c] if c in trans2 else re.escape(c)) for c in q[1:]),
        ),
        re.I,
    ),
    # "[[target|phrase]]" - the literal phrase (no flexible spelling) is the
    # complete label of a piped, non-File link.
    lambda q: re.compile(
        r"(?<!-)\[\[(?!File:)[^|]+\|(%s)%s\]\]" % (re.escape(q[0]), re.escape(q[1:])),
        re.I,
    ),
    # "[[target|phrase" - the label of a piped, non-File link starts with the
    # phrase (flexible spelling); the closing "]]" is consumed when it
    # immediately follows.
    lambda q: re.compile(
        r"(?<!-)\[\[(?!File:)[^|]+\|(%s)%s(?:\]\])?"
        % (
            re.escape(q[0]),
            "".join("-?" + (trans2[c] if c in trans2 else re.escape(c)) for c in q[1:]),
        ),
        re.I,
    ),
    # The literal phrase anywhere in the text.
    lambda q: re.compile(r"(?<!-)(%s)%s" % (re.escape(q[0]), re.escape(q[1:])), re.I),
    # The phrase anywhere in the text with trans2 applied to spaces, hyphens
    # and en dashes, but without the optional hyphen before each character.
    lambda q: re.compile(
        r"(?<!-)(%s)%s"
        % (
            re.escape(q[0]),
            "".join((trans2[c] if c in trans2 else re.escape(c)) for c in q[1:]),
        ),
        re.I,
    ),
]
|
|
|
|
|
|
class NoMatch(Exception):
    """No match.

    Raised by the find_* / get_* functions in this module when the phrase
    could not be linked in the given content.
    """
|
|
|
|
|
|
# A <ref ...> opening tag through a </ref> closing tag.
# NOTE(review): ".*" is greedy and re.S lets it cross newlines, so one match
# runs from the first <ref> to the LAST </ref> in the searched text; confirm
# that this is intended (the earlier, commented-out pattern was non-greedy).
re_cite = re.compile(
    # r"<ref( [^>]*?)?>\s*({{cite.*?}}|\[https?://[^]]*?\])\s*</ref>", re.I | re.S
    r"<ref( [^>]*?)?>.*</ref>",
    re.I | re.S,
)

# Opening "{{name" of the templates that find_cite_template_spans treats as
# protected spans (case-insensitive, name must end at a word boundary).
re_cite_template_start = re.compile(r"\{\{(?:cite|citation|short description|gli|defn|annotated link|excerpt|main|see|for)\b", re.I)
# A template with no parameters and no nested braces, e.g. {{name}}.
re_no_param_template = re.compile(r"\{\{[^|{}]+\}\}")
# A bracketed external link: [http://...] or [https://...].
re_external_link = re.compile(r"\[https?://[^\]]+\]")
# Italic text (work titles in bibliographies). Handles apostrophes in content
# (e.g. ''It's fine'') but requires at least one non-apostrophe character so
# that ''' bold ''' is not consumed as italic.
re_italic = re.compile(r"''[^']+(?:'[^']+)*''")
# Bullet-point lines that contain a bare URL are unformatted bibliography entries.
re_bullet_with_url = re.compile(r"^\*[^\n]*https?://[^\s\n]+[^\n]*", re.MULTILINE)
|
|
|
|
|
|
def find_cite_template_spans(text: str) -> list[tuple[int, int]]:
    """Return (start, end) offsets of {{Cite ...}}-style templates in *text*.

    A span starts at an opener matched by ``re_cite_template_start`` and ends
    just after the ``}}`` that balances it, so nested templates stay inside
    the span.  An opener whose braces never balance contributes no span.
    """
    found: list[tuple[int, int]] = []
    text_len = len(text)
    for opener in re_cite_template_start.finditer(text):
        begin = opener.start()
        # Openers arrive left to right and recorded spans never overlap, so
        # only the most recently recorded span can contain this opener.
        if found and begin < found[-1][1]:
            continue

        nesting = 0
        pos = begin
        while pos < text_len:
            if text.startswith("{{", pos):
                nesting += 1
                pos += 2
            elif text.startswith("}}", pos):
                nesting -= 1
                pos += 2
                if nesting == 0:
                    found.append((begin, pos))
                    break
            else:
                pos += 1
    return found
|
|
|
|
|
|
def parse_cite(text: str) -> typing.Iterator[tuple[str, str]]:
    """Split *text* into alternating ("text", chunk) and ("cite", chunk) pieces.

    "cite" chunks are the regions that must be left alone: <ref> blocks,
    cite-like templates, parameterless templates, bracketed external links,
    italic runs and bullet lines containing a URL.  Overlapping regions are
    merged.  A "text" chunk is emitted before every "cite" chunk and once at
    the end, even when it is empty.
    """
    regex_sources = (
        re_cite,
        re_no_param_template,
        re_external_link,
        re_italic,
        re_bullet_with_url,
    )
    spans = [hit.span() for regex in regex_sources for hit in regex.finditer(text)]
    spans += find_cite_template_spans(text)
    spans.sort()

    # Fold overlapping spans together (e.g. a {{Cite}} that sits inside a <ref>).
    merged: list[tuple[int, int]] = []
    for begin, finish in spans:
        if not merged or begin >= merged[-1][1]:
            merged.append((begin, finish))
            continue
        last_begin, last_finish = merged[-1]
        merged[-1] = (last_begin, max(last_finish, finish))

    cursor = 0
    for begin, finish in merged:
        yield ("text", text[cursor:begin])
        yield ("cite", text[begin:finish])
        cursor = finish
    yield ("text", text[cursor:])
|
|
|
|
|
|
# A wikitext heading line: a run of "=" (group 1, its length is the heading
# level), the title (group 2), the same run of "=" again, then nothing but
# whitespace and/or HTML comments up to the end of the line.
re_heading = re.compile(r"^\s*(=+)\s*(.+)\s*\1(<!--.*-->|\s)*$")
|
|
|
|
|
|
def section_iter(text: str) -> typing.Iterator[tuple[str | None, str]]:
    """Yield (heading, body) for each section of *text*.

    ``heading`` is the complete heading line as matched by ``re_heading``
    (``None`` for text before the first heading) and ``body`` is everything
    up to the next heading.  Heading-like lines are treated as body text while
    inside an HTML comment.  An empty lead before the first heading is not
    yielded; the final section always is.
    """
    heading = None
    body_lines: list[str] = []
    inside_comment = False

    for line in text.splitlines(True):
        # Comment tracking is per line: an opener switches it on, a closer on
        # the same or a later line switches it off again.
        if "<!--" in line:
            inside_comment = True
        if "-->" in line:
            inside_comment = False

        heading_match = re_heading.match(line)
        if heading_match and not inside_comment:
            if body_lines or heading:
                yield (heading, "".join(body_lines))
            heading = heading_match.group()
            body_lines = []
        else:
            body_lines.append(line)

    yield (heading, "".join(body_lines))
|
|
|
|
|
|
def get_subsections(text: str, section_num: int) -> str:
    """Return the wikitext of the subsections nested under section *section_num*.

    Sections are numbered in the order ``section_iter`` yields them.  Every
    following section with a deeper heading level is appended (heading plus
    body) until a section at the same or a shallower level is reached.
    Nothing is collected when the chosen section is heading-less (level 0).
    """
    collected: list[str] = []
    target_level = None

    for index, (heading, body) in enumerate(section_iter(text)):
        level = 0
        if heading is not None:
            heading_match = re_heading.match(heading)
            assert heading_match
            level = len(heading_match.group(1))

        if index == section_num:
            target_level = level
            continue
        if not target_level:
            # Target not reached yet, or the target has level 0.
            continue
        if level <= target_level:
            break

        assert heading
        collected.append(heading + body)

    return "".join(collected)
|
|
|
|
|
|
def match_found(m: re.Match[str], q: str, linkto: str | None) -> str:
|
|
if q[1:] == m.group(0)[1:]:
|
|
replacement = m.group(1) + q[1:]
|
|
elif any(c.isupper() for c in q[1:]) or m.group(0) == m.group(0).upper():
|
|
replacement = q
|
|
elif is_title_case(m.group(0)):
|
|
replacement = None
|
|
replacement = get_case_from_content(q)
|
|
if replacement is None:
|
|
replacement = q.lower()
|
|
else:
|
|
replacement = m.group(1) + q[1:]
|
|
assert replacement
|
|
if m.group(1).isupper() and replacement[0].islower():
|
|
pos = m.start()
|
|
if pos == 0 or m.string[pos - 1] == "\n":
|
|
replacement = replacement[0].upper() + replacement[1:]
|
|
if linkto:
|
|
if linkto[0].isupper() and replacement[0] == linkto[0].lower():
|
|
linkto = linkto[0].lower() + linkto[1:]
|
|
elif replacement[0].isupper():
|
|
linkto = linkto[0].upper() + linkto[1:]
|
|
replacement = linkto + "|" + replacement
|
|
return replacement
|
|
|
|
|
|
def parse_links(text: str) -> typing.Iterator[tuple[str, str]]:
    """Tokenise *text* into ("text" | "image" | "category" | "link", chunk) pairs.

    Every ``[[...]]`` span becomes its own token, classified by its
    (case-insensitive) ``File:``/``Image:`` or ``Category:`` prefix; the
    non-empty stretches between links are emitted as "text".
    """
    cursor = 0
    for link in re_link_in_text.finditer(text):
        begin = link.start()
        if cursor != begin:
            yield ("text", text[cursor:begin])

        raw = link.group(0)
        lowered = raw.lower()
        if lowered.startswith(("[[file:", "[[image:")):
            kind = "image"
        elif lowered.startswith("[[category:"):
            kind = "category"
        else:
            kind = "link"
        yield (kind, raw)

        cursor = link.end()

    if cursor < len(text):
        yield ("text", text[cursor:])
|
|
|
|
|
|
def mk_link_matcher(q: str) -> typing.Callable[[str], re.Match[str] | None]:
    """Compile every entry of ``patterns`` for *q* and return a search function.

    The returned function tries the compiled patterns in order and returns
    the first match whose text contains fewer than four ``[[``; it returns
    ``None`` when no pattern qualifies.
    """
    compiled = [build(q) for build in patterns]

    def search_for_link(text: str) -> re.Match[str] | None:
        for regex in compiled:
            hit = regex.search(text)
            if hit is not None and hit.group(0).count("[[") < 4:
                return hit
        return None

    return search_for_link
|
|
|
|
|
|
def add_link(m: re.Match[str], replacement: str, text: str) -> str:
    """Return *text* with the span matched by *m* turned into a wikilink.

    *m* must have been produced on *text* or on a copy of it with identical
    offsets (callers search a copy in which category links are blanked out).
    All edits are spliced in at ``m.span()``; the regex is never re-run on
    *text*, because a second search could hit an earlier occurrence that the
    caller deliberately masked.

    *replacement* is the link body, either ``"target"`` or ``"target|label"``.
    """
    matched_text = m.group(0)
    start, end = m.span()
    before, after = text[:start], text[end:]

    if matched_text.startswith("[[") and matched_text.endswith("|"):
        # The match is the "[[target|" part of an existing piped link:
        # swap the target and keep the existing label.
        return f"{before}[[{replacement}|{after}"

    link_dest = replacement.partition("|")[0]
    starts_inside_link = start >= 2 and text[start - 2 : start] == "[["

    split_links = matched_text.find("]] [[")
    if split_links > 0 and starts_inside_link:
        # Match starts inside one link and continues into the next opening link.
        # Link only the text from the first link span and leave the second link as-is.
        visible = matched_text[:split_links]
        return (
            text[: start - 2]
            + f"[[{link_dest}|{visible}]]"
            + text[start + split_links + 2 :]
        )

    inner_bracket = matched_text.find("[[")
    if inner_bracket > 0:
        prefix = matched_text[:inner_bracket].rstrip()
        sep = matched_text[len(prefix) : inner_bracket]
        suffix = matched_text[inner_bracket:]
        if text[end : end + 2] == "]]":
            # The existing [[link]] is fully consumed by the match (its closing ]]
            # immediately follows). Replace everything with a single clean link.
            # e.g. "surface [[runoff (hydrology)|runoff]]" -> "[[surface runoff]]"
            return f"{before}[[{link_dest}]]" + text[end + 2 :]
        # The existing link is only partially consumed; keep it and link just the prefix.
        # e.g. "cross-platform [[interchange station]]"
        #   -> "[[cross-platform interchange|cross-platform]] [[interchange station]]"
        return f"{before}[[{link_dest}|{prefix}]]{sep}{suffix}{after}"

    if matched_text.find("]]") > 0 and starts_inside_link:
        # Match started inside an existing [[link]] and spans beyond its closing ]].
        # e.g. "[[anti-globalization]] movement" matched as "anti-globalization]] movement"
        #   -> absorb the leading [[ and produce "[[anti-globalization movement]]"
        return text[: start - 2] + f"[[{replacement}]]" + after

    return f"{before}[[{replacement}]]{after}"
|
|
|
|
|
|
def match_spans_existing_link(m: re.Match[str], text: str) -> bool:
    """Tell whether the match begins partway through an existing [[link]].

    The trans2 space fragment lets a match run across a ``]]``.  That is fine
    when the match begins directly after the link's ``[[`` (for example
    ``[[anti-globalization]] movement``), because add_link can absorb the
    brackets.  When the match begins deeper inside the link (for example
    'surface runoff' found in ``[[impervious surface]] runoff``) absorbing
    them would produce broken wikitext, so such matches are reported here and
    skipped by the caller.
    """
    if m.group(0).find("]]") <= 0:
        # No closing brackets after the first character: nothing is spanned.
        return False

    begin = m.start()
    follows_opening = begin >= 2 and text[begin - 2 : begin] == "[["
    return not follows_opening
|
|
|
|
|
|
def match_is_inside_existing_link(m: re.Match[str], text: str) -> bool:
    """Tell whether the match lies completely within one existing [[link]].

    This catches hits in a link's destination, such as 'Post-fascism' found
    inside ``[[Post-fascism in Italy|post-fascist]]``, which must not be
    wrapped in a second link.  A match containing ``]]`` crosses a link
    boundary and is left to add_link's cross-link handling, so it is never
    reported here.
    """
    if "]]" in m.group(0):
        return False

    begin, finish = m.span()
    return any(
        link.start() <= begin and finish <= link.end()
        for link in re_link_in_text.finditer(text)
    )
|
|
|
|
|
|
def is_part_of_named_entity(m: re.Match[str], text: str) -> bool:
    """Tell whether a capitalised match is only a fragment of a longer name.

    The signal is: match starting with an upper-case letter, then one to six
    further words, then a parenthesised upper-case abbreviation.
    "Anti-Globalization Movement" followed by "of Russia (AGMR)" is a
    fragment; followed directly by "(AGM)" it is the complete name and is
    not reported.
    """
    if m.group(0)[0].isupper():
        trailing = text[m.end() :]
        return re_named_entity_abbrev.match(trailing) is not None
    return False
|
|
|
|
|
|
def find_link_in_chunk(
    q: str, content: str, linkto: str | None = None
) -> tuple[str, str | None, str | None]:
    """Find link in chunk.

    Try to turn one occurrence of phrase *q* in *content* into a wikilink.

    Returns ``(new_content, replacement, found_text_to_link)``:
    the (possibly edited) chunk, the link body that was inserted and the text
    that matched.  The last two are ``None`` when nothing was linked, in which
    case ``new_content`` is the chunk re-assembled from its tokens.

    Raises LinkReplace when nothing was linked and an existing link matching
    the phrase was flagged as a bad match.
    """
    search_for_link = mk_link_matcher(q)
    new_content = ""
    replacement = None

    match_in_non_link = False
    bad_link_match = False
    found_text_to_link = None

    for token_type, text in parse_links(content):
        if token_type == "text":
            # Only remember that the phrase occurs outside a link; the edit
            # itself is made by the whole-chunk search after the loop. Once
            # set, later "link" tokens are no longer rewritten.
            if search_for_link(text):
                match_in_non_link = True
        elif token_type == "image":
            # Search only the part after the last "|" (with the closing "]]"
            # stripped) and put the link there.
            before, sep, link_text = text[:-2].rpartition("|")
            m = search_for_link(link_text)
            if m:
                found_text_to_link = m.group(0)
                replacement = match_found(m, q, linkto)
                text = before + sep + add_link(m, replacement, link_text) + "]]"
        elif token_type == "link" and not replacement and not match_in_non_link:
            # Split "[[dest|label]]" into destination and label; a link
            # without a pipe has no separate destination.
            link_text = text[2:-2]
            link_dest = None
            if "|" in link_text:
                link_dest, link_text = link_text.split("|", 1)
            m = search_for_link(link_text)
            # Links whose destination starts with "#" are left alone.
            if m and (not link_dest or not link_dest.startswith("#")):
                # lc_alpha comes from .util; presumably it normalises to
                # lower-case letters only - TODO confirm.
                lc_alpha_q = lc_alpha(q)

                # Flag the link as bad when its destination is longer than
                # the phrase and either does not contain the phrase or
                # starts with it.
                bad_link_match = (
                    link_dest
                    and len(link_dest) > len(q)
                    and (
                        lc_alpha_q not in lc_alpha(link_dest)
                        or lc_alpha(link_dest).startswith(lc_alpha_q)
                    )
                )
                if not link_dest:
                    # Unpiped link whose text is longer than the phrase and
                    # contains it verbatim.
                    if q in link_text and len(link_text) > len(q):
                        bad_link_match = True
                if bad_link_match and link_dest:
                    # NOTE(review): get_wiki_info (from .api) is treated here
                    # as returning the title the destination resolves to -
                    # confirm against its definition.
                    try:
                        link_dest_redirect = get_wiki_info(link_dest)
                    except MissingPage:
                        link_dest_redirect = None
                    if (
                        link_dest_redirect
                        and lc_alpha(link_dest_redirect) == lc_alpha_q
                    ):
                        bad_link_match = False
                if not bad_link_match:
                    # The whole token is replaced by the re-linked label.
                    replacement = match_found(m, q, linkto)
                    found_text_to_link = m.group(0)
                    text = add_link(m, replacement, link_text)
        new_content += text
    if not replacement:
        if bad_link_match:
            raise LinkReplace
        # Blank out category links (same length, so offsets still line up
        # with `content`) and search the chunk as a whole.
        masked = re_category_link.sub(lambda c: " " * len(c.group(0)), content)
        m = search_for_link(masked)
        if m and not is_part_of_named_entity(m, content) and not match_spans_existing_link(m, content) and not match_is_inside_existing_link(m, content):
            found_text_to_link = m.group(0)
            replacement = match_found(m, q, linkto)
            new_content = add_link(m, replacement, content)
            if linkto:
                # With an explicit target, stretch the match over any word
                # characters that directly follow it, so that the trailing
                # letters become part of the link label.
                m_end = m.end()
                re_extend = re.compile(m.re.pattern + r"\w*\b", re.I)
                m = re_extend.search(content)
                if m and m.end() > m_end:
                    replacement += content[m_end : m.end()]
                    new_content = add_link(m, replacement, content)
    return (new_content, replacement, found_text_to_link)
|
|
|
|
|
|
def find_link_in_text(q: str, content: str) -> tuple[str, str]:
    """Link phrase *q* in *content* and return ``(new_content, replacement)``.

    Thin wrapper around find_link_in_chunk that turns "nothing linked" into
    an exception.

    Raises:
        NoMatch: the phrase could not be linked.
        LinkReplace: propagated unchanged from find_link_in_chunk.
    """
    # find_link_in_chunk returns three values; the matched source text is
    # not part of this function's result.
    new_content, replacement, _found_text = find_link_in_chunk(q, content)
    if replacement:
        return (new_content, replacement)
    raise NoMatch
|
|
|
|
|
|
def find_link_in_content(q, content, linkto=None):
    """Link phrase *q* once in a whole article and return the edited article.

    When *linkto* is given, the target title itself is searched for first
    (without a link target); only if that raises NoMatch is *q* searched for
    and linked to *linkto*.

    Returns ``(new_content, replacement, replaced_text)``.  Raises LinkReplace
    if nothing was linked and at least one chunk raised LinkReplace,
    otherwise NoMatch.
    """
    if linkto:
        try:
            return find_link_in_content(linkto, content)
        except NoMatch:
            pass  # fall back to searching for q itself

    pieces = []
    replacement = None
    saw_link_replace = False

    for header, section_text in section_iter(content):
        if header:
            pieces.append(header)
        for token_type, chunk in parse_cite(section_text):
            # Protected chunks, and everything after the first edit, are
            # copied through unchanged.
            if replacement or token_type != "text":
                pieces.append(chunk)
                continue
            try:
                linked_chunk, replacement, replaced_text = find_link_in_chunk(
                    q, chunk, linkto=linkto
                )
            except LinkReplace:
                saw_link_replace = True
            else:
                if replacement:
                    chunk = linked_chunk
            pieces.append(chunk)

    if replacement:
        return ("".join(pieces), replacement, replaced_text)
    raise LinkReplace if saw_link_replace else NoMatch
|
|
|
|
|
|
def find_link_and_section(q: str, content: str, linkto: str | None = None) -> dict[str, str | int]:
    """Link phrase *q* once and report which section was edited.

    When *linkto* is given, the target title itself is searched for first
    (without a link target); only if that raises NoMatch is *q* searched for.

    Returns a dict with "section_num" (index in section_iter order),
    "section_text" (the edited section including its heading), "old_text"
    (the section before the edit) and "replacement" (the inserted link body).
    When an existing link was rewritten it also holds "link_text" and, for a
    piped link, "link_dest".

    Raises NoMatch when no section could be edited.
    """
    if linkto:
        try:
            return find_link_and_section(linkto, content)
        except NoMatch:
            pass
    sections = list(section_iter(content))
    replacement = None

    search_for_link = mk_link_matcher(q)

    found: dict[str, str | int] = {}

    for section_num, (header, section_text) in enumerate(sections):
        # new_content is rebuilt per section: only the edited section is returned.
        new_content = ""
        if header:
            new_content += header
        for token_type, text in parse_cite(section_text):
            # Protected chunks, and everything after the first edit, are
            # copied through unchanged.
            if token_type != "text" or replacement:
                new_content += text
                continue
            # First pass: look for the phrase inside existing links.
            new_text = ""
            for token_type2, text2 in parse_links(text):
                if token_type2 == "link" and not replacement:
                    link_text = text2[2:-2]
                    if "|" in link_text:
                        link_dest, link_text = link_text.split("|", 1)
                    else:
                        link_dest = None
                    m = search_for_link(link_text)
                    if m:
                        # lc_alpha comes from .util; presumably it normalises
                        # to lower-case letters only - TODO confirm.
                        lc_alpha_q = lc_alpha(q)
                        # Bad when the destination is longer than the phrase
                        # and either lacks the phrase or starts with it.
                        bad = link_dest and len(link_dest) > len(q) and (
                            lc_alpha_q not in lc_alpha(link_dest)
                            or lc_alpha(link_dest).startswith(lc_alpha_q)
                        )
                        if not bad and not link_dest:
                            # Unpiped link: bad when its text is longer than
                            # the phrase and starts with it.
                            bad = len(link_text) > len(q) and lc_alpha_q in lc_alpha(link_text) and lc_alpha(link_text).startswith(lc_alpha_q)
                        if bad and link_dest:
                            # NOTE(review): get_wiki_info (from .api) is
                            # treated here as returning the title the
                            # destination resolves to - confirm.
                            try:
                                redirect = get_wiki_info(link_dest)
                            except MissingPage:
                                redirect = None
                            if redirect and lc_alpha(redirect) == lc_alpha_q:
                                bad = False
                        if not bad:
                            if link_dest:
                                found["link_dest"] = link_dest
                            found["link_text"] = link_text
                            # linkto is deliberately not applied when an
                            # existing link is rewritten (None is passed).
                            replacement = match_found(m, q, None)
                            text2 = add_link(m, replacement, link_text)
                new_text += text2
            if replacement:
                text = new_text
            else:
                # Second pass: search the chunk as a whole with category
                # links blanked out (same length, so offsets still line up).
                masked = re_category_link.sub(lambda c: " " * len(c.group(0)), text)
                m = search_for_link(masked)
                if m and not is_part_of_named_entity(m, text) and not match_spans_existing_link(m, text) and not match_is_inside_existing_link(m, text):
                    replacement = match_found(m, q, linkto)
                    text = add_link(m, replacement, text)
            new_content += text
        if replacement:
            found.update(
                {
                    "section_num": section_num,
                    "section_text": new_content,
                    "old_text": (header or "") + section_text,
                    "replacement": replacement,
                }
            )
            return found
    raise NoMatch
|
|
|
|
|
|
def find_refs(text: str) -> list[str]:
    """Return the bodies of all ``<ref>...</ref>`` elements in *text*.

    Tag names are matched case-insensitively and a body may span several
    lines.  Self-closing tags such as ``<ref name="x" />`` and other tags
    that merely start with "ref" (``<references>``) are not treated as
    openers; otherwise the article text between such a tag and the next
    ``</ref>`` would be reported as a reference body.  An empty
    ``<ref></ref>`` yields an empty string.
    """
    return re.findall(r"<ref(?:\s[^>]*)?(?<!/)>(.*?)</ref>", text, re.I | re.S)
|
|
|
|
|
|
def new_link_is_in_ref(replacement: str, text: str) -> bool:
    """Report whether ``[[replacement]]`` occurs inside any <ref> body of *text*."""
    needle = f"[[{replacement}]]"
    for ref_body in find_refs(text):
        if needle in ref_body:
            return True
    return False
|
|
|
|
|
|
def get_match(q: str, title: str, linkto: str | None) -> dict[str, typing.Any]:
    """Fetch article *title*, link phrase *q* in it and describe the edit.

    The result is the dict from find_link_and_section with "revid" and
    "pageid" added from the fetched revision, and with the text of the edited
    section's subsections appended to "section_text".

    Raises NoMatch (from find_link_and_section) when nothing could be linked;
    asserts that the new link did not land inside a <ref>.
    """
    rev = get_revision_info(title)
    article = rev["content"]

    found: dict[str, typing.Any] = find_link_and_section(q, article, linkto)

    assert not new_link_is_in_ref(found["replacement"], found["section_text"])

    found["revid"] = rev["revid"]
    found["pageid"] = rev["pageid"]
    subsections = get_subsections(article, found["section_num"])
    found["section_text"] = found["section_text"] + subsections

    return found
|
|
|
|
|
|
def get_diff(q: str, title: str, linkto: str | None) -> dict[str, typing.Any]:
    """Fetch article *title*, link phrase *q* in it and attach a diff of the edit.

    The result is the dict from find_link_and_section with a "diff" entry
    obtained from call_get_diff for the edited section plus its subsections.

    Raises NoMatch when nothing could be linked, when the new link landed
    inside a <ref>, or when the diff comes back empty.
    """
    content, _timestamp = get_content_and_timestamp(title)
    found: dict[str, typing.Any] = find_link_and_section(q, content, linkto)

    if new_link_is_in_ref(found["replacement"], found["section_text"]):
        raise NoMatch

    section_num = found["section_num"]
    section_text = found["section_text"] + get_subsections(content, section_num)

    diff = call_get_diff(title, section_num, section_text)
    found["diff"] = diff
    if not diff:
        raise NoMatch
    return found
|