Compare commits

..

No commits in common. "b76a6707f2ab5601414217d0897ccd61e686d2d0" and "113dfd363026c6b6312ca7cd657c655fa3e854e8" have entirely different histories.

3 changed files with 76 additions and 102 deletions

View file

@ -11,7 +11,7 @@ re_link_in_text = re.compile(r"\[\[[^]]+?\]\]", re.I | re.S)
class LinkReplace(Exception): class LinkReplace(Exception):
"""Replaces and existing link.""" pass
en_dash = "\u2013" en_dash = "\u2013"
@ -23,7 +23,7 @@ trans2[en_dash] = trans2[" "]
patterns = [ patterns = [
lambda q: re.compile( lambda q: re.compile(
r"(?<!-)\[\[(%s)%s\|(?=.*\]\])" r"(?<!-)(?:\[\[(?:[^]]+\|)?)?(%s)%s(?:\]\])?"
% ( % (
re.escape(q[0]), re.escape(q[0]),
"".join("-?" + (trans2[c] if c in trans2 else re.escape(c)) for c in q[1:]), "".join("-?" + (trans2[c] if c in trans2 else re.escape(c)) for c in q[1:]),
@ -31,19 +31,10 @@ patterns = [
re.I, re.I,
), ),
lambda q: re.compile( lambda q: re.compile(
r"(?<!-)\[\[(?:(?!File:)(?:[^]]+\|)?)(%s)%s\]\]" r"(?<!-)\[\[[^|]+\|(%s)%s\]\]" % (re.escape(q[0]), re.escape(q[1:])), re.I
% (
re.escape(q[0]),
"".join("-?" + (trans2[c] if c in trans2 else re.escape(c)) for c in q[1:]),
),
re.I,
), ),
lambda q: re.compile( lambda q: re.compile(
r"(?<!-)\[\[(?!File:)[^|]+\|(%s)%s\]\]" % (re.escape(q[0]), re.escape(q[1:])), r"(?<!-)\[\[[^|]+\|(%s)%s(?:\]\])?"
re.I,
),
lambda q: re.compile(
r"(?<!-)\[\[(?!File:)[^|]+\|(%s)%s(?:\]\])?"
% ( % (
re.escape(q[0]), re.escape(q[0]),
"".join("-?" + (trans2[c] if c in trans2 else re.escape(c)) for c in q[1:]), "".join("-?" + (trans2[c] if c in trans2 else re.escape(c)) for c in q[1:]),
@ -55,7 +46,7 @@ patterns = [
r"(?<!-)(%s)%s" r"(?<!-)(%s)%s"
% ( % (
re.escape(q[0]), re.escape(q[0]),
"".join((trans2[c] if c in trans2 else re.escape(c)) for c in q[1:]), "".join((trans[c] if c in trans else re.escape(c)) for c in q[1:]),
), ),
re.I, re.I,
), ),
@ -63,13 +54,11 @@ patterns = [
class NoMatch(Exception): class NoMatch(Exception):
"""No match.""" pass
re_cite = re.compile( re_cite = re.compile(
# r"<ref( [^>]*?)?>\s*({{cite.*?}}|\[https?://[^]]*?\])\s*</ref>", re.I | re.S r"<ref( [^>]*?)?>\s*({{cite.*?}}|\[https?://[^]]*?\])\s*</ref>", re.I | re.S
r"<ref( [^>]*?)?>.*</ref>",
re.I | re.S,
) )
@ -109,7 +98,7 @@ def section_iter(text: str) -> typing.Iterator[tuple[str | None, str]]:
def get_subsections(text: str, section_num: int) -> str: def get_subsections(text: str, section_num: int) -> str:
"""Retrieve the text of subsections for a given section number within an article.""" "retrieve the text of subsections for a given section number within an article"
found = "" found = ""
collection_level = None collection_level = None
for num, (heading, body) in enumerate(section_iter(text)): for num, (heading, body) in enumerate(section_iter(text)):
@ -131,7 +120,7 @@ def get_subsections(text: str, section_num: int) -> str:
return found return found
def match_found(m: re.Match[str], q: str, linkto: str | None) -> str: def match_found(m, q, linkto):
if q[1:] == m.group(0)[1:]: if q[1:] == m.group(0)[1:]:
replacement = m.group(1) + q[1:] replacement = m.group(1) + q[1:]
elif any(c.isupper() for c in q[1:]) or m.group(0) == m.group(0).upper(): elif any(c.isupper() for c in q[1:]) or m.group(0) == m.group(0).upper():
@ -170,34 +159,23 @@ def parse_links(text: str) -> typing.Iterator[tuple[str, str]]:
yield ("text", text[prev:]) yield ("text", text[prev:])
def mk_link_matcher(q: str) -> typing.Callable[[str], re.Match[str] | None]: def mk_link_matcher(q):
"""Make link matcher."""
re_links = [p(q) for p in patterns] re_links = [p(q) for p in patterns]
def search_for_link(text: str) -> re.Match[str] | None: def search_for_link(text):
for re_link in re_links: for re_link in re_links:
m = re_link.search(text) m = re_link.search(text)
if m and m.group(0).count("[[") < 4: if m and m.group(0).count("[[") < 4:
return m return m
return None
return search_for_link return search_for_link
def add_link(m: re.Match[str], replacement: str, text: str) -> str: def add_link(m, replacement, text):
"""Add link to text.""" return m.re.sub(lambda m: "[[%s]]" % replacement, text, count=1)
matched_text = m.group(0)
if matched_text.startswith("[[") and matched_text.endswith("|"):
return m.re.sub(lambda m: f"[[{replacement}|", text, count=1)
else:
return m.re.sub(lambda m: f"[[{replacement}]]", text, count=1)
def find_link_in_chunk( def find_link_in_chunk(q, content, linkto=None):
q: str, content: str, linkto: str | None = None
) -> tuple[str, str | None, str | None]:
"""Find link in chunk."""
search_for_link = mk_link_matcher(q) search_for_link = mk_link_matcher(q)
new_content = "" new_content = ""
replacement = None replacement = None
@ -267,7 +245,7 @@ def find_link_in_chunk(
return (new_content, replacement, found_text_to_link) return (new_content, replacement, found_text_to_link)
def find_link_in_text(q: str, content: str) -> tuple[str, str]: def find_link_in_text(q, content):
(new_content, replacement) = find_link_in_chunk(q, content) (new_content, replacement) = find_link_in_chunk(q, content)
if replacement: if replacement:
return (new_content, replacement) return (new_content, replacement)
@ -302,7 +280,7 @@ def find_link_in_content(q, content, linkto=None):
raise LinkReplace if link_replace else NoMatch raise LinkReplace if link_replace else NoMatch
def find_link_and_section(q: str, content: str, linkto: str | None = None): def find_link_and_section(q, content, linkto=None):
if linkto: if linkto:
try: try:
return find_link_and_section(linkto, content) return find_link_and_section(linkto, content)
@ -320,32 +298,30 @@ def find_link_and_section(q: str, content: str, linkto: str | None = None):
if header: if header:
new_content += header new_content += header
for token_type, text in parse_cite(section_text): for token_type, text in parse_cite(section_text):
if token_type != "text" or replacement: if token_type == "text" and not replacement:
new_content += text new_text = ""
continue for token_type2, text2 in parse_links(text):
new_text = "" if token_type2 == "link" and not replacement:
for token_type2, text2 in parse_links(text): link_text = text2[2:-2]
if token_type2 == "link" and not replacement: if "|" in link_text:
link_text = text2[2:-2] link_dest, link_text = link_text.split("|", 1)
if "|" in link_text: else:
link_dest, link_text = link_text.split("|", 1) link_dest = None
else: m = search_for_link(link_text)
link_dest = None if m:
m = search_for_link(link_text) if link_dest:
found["link_dest"] = link_dest
found["link_text"] = link_text
replacement = match_found(m, q, None)
text2 = add_link(m, replacement, link_text)
new_text += text2
if replacement:
text = new_text
else:
m = search_for_link(text)
if m: if m:
if link_dest: replacement = match_found(m, q, linkto)
found["link_dest"] = link_dest text = add_link(m, replacement, text)
found["link_text"] = link_text
replacement = match_found(m, q, None)
text2 = add_link(m, replacement, link_text)
new_text += text2
if replacement:
text = new_text
else:
m = search_for_link(text)
if m:
replacement = match_found(m, q, linkto)
text = add_link(m, replacement, text)
new_content += text new_content += text
if replacement: if replacement:
found.update( found.update(
@ -362,7 +338,9 @@ def find_link_and_section(q: str, content: str, linkto: str | None = None):
def find_refs(text: str) -> list[str]: def find_refs(text: str) -> list[str]:
"""Find <ref> in wikitext.""" """Find <ref> in wikitext."""
refs = re.findall("<ref(?:[^>]*)>(.+?)</ref>", text) refs = re.findall("<ref(?:[^>]*)>(.+?)</ref>", text)
print(refs)
return refs return refs

View file

@ -47,7 +47,6 @@ def search_count_with_link(q: str) -> int:
def parse_contribs() -> list[tuple[str, int]]: def parse_contribs() -> list[tuple[str, int]]:
"""Parse user contributions."""
re_comment = re.compile(r"^link \[\[(.*)\]\] using") re_comment = re.compile(r"^link \[\[(.*)\]\] using")
links: collections.Counter[str] = collections.Counter() links: collections.Counter[str] = collections.Counter()
@ -71,48 +70,45 @@ def parse_contribs() -> list[tuple[str, int]]:
return links.most_common(200) return links.most_common(200)
def main() -> None: with open("examples") as f:
with open("examples") as f: seen = {json.loads(line)["title"] for line in f}
seen = {json.loads(line)["title"] for line in f}
out = open("examples", "a")
for from_title, num in parse_contribs():
if from_title in seen:
continue
count = search_count(from_title)
count_with_link = search_count_with_link(from_title)
ratio = float(count_with_link) / float(count)
print(from_title, count, count_with_link, f"{ratio:.1%}")
print(
json.dumps(
{"title": from_title, "total": count, "with_links": count_with_link}
),
file=out,
)
out.flush()
time.sleep(0.1)
out.close()
sys.exit(0)
out = open("examples", "a")
for from_title, num in parse_contribs():
if from_title in seen:
continue
count = search_count(from_title) count = search_count(from_title)
count_with_link = search_count_with_link(from_title) count_with_link = search_count_with_link(from_title)
ratio = float(count_with_link) / float(count) ratio = float(count_with_link) / float(count)
print(count, count_with_link, f"{ratio:.1%}") print(from_title, count, count_with_link, f"{ratio:.1%}")
print(
json.dumps(
{"title": from_title, "total": count, "with_links": count_with_link}
),
file=out,
)
out.flush()
time.sleep(0.1)
out.close()
sys.exit(0) sys.exit(0)
totalhits, search_hits = search_no_link(from_title)
for hit in search_hits:
print(" ", hit)
print(count, count_with_link, f"{ratio:.1%}", totalhits, len(search_hits))
# ret = core.do_search(from_title)
# print(ret)
if __name__ == "__main__": count = search_count(from_title)
main() count_with_link = search_count_with_link(from_title)
ratio = float(count_with_link) / float(count)
print(count, count_with_link, f"{ratio:.1%}")
sys.exit(0)
totalhits, search_hits = search_no_link(from_title)
for hit in search_hits:
print(" ", hit)
print(count, count_with_link, f"{ratio:.1%}", totalhits, len(search_hits))
# ret = core.do_search(from_title)
# print(ret)

View file

@ -289,8 +289,8 @@ def get_best_hit(title: str, hits: list[Hit]) -> tuple[Hit, dict[str, typing.Any
for hit in hits: for hit in hits:
if hit["title"].lower() == title.lower(): if hit["title"].lower() == title.lower():
continue continue
# if match_type(title, hit["snippet"]) != "exact": if match_type(title, hit["snippet"]) != "exact":
# continue continue
try: try:
print(f'get diff: {hit["title"]}, {title}') print(f'get diff: {hit["title"]}, {title}')