Compare commits
No commits in common. "b76a6707f2ab5601414217d0897ccd61e686d2d0" and "113dfd363026c6b6312ca7cd657c655fa3e854e8" have entirely different histories.
b76a6707f2
...
113dfd3630
|
@ -11,7 +11,7 @@ re_link_in_text = re.compile(r"\[\[[^]]+?\]\]", re.I | re.S)
|
||||||
|
|
||||||
|
|
||||||
class LinkReplace(Exception):
|
class LinkReplace(Exception):
|
||||||
"""Replaces an existing link."""
|
pass
|
||||||
|
|
||||||
|
|
||||||
en_dash = "\u2013"
|
en_dash = "\u2013"
|
||||||
|
@ -23,7 +23,7 @@ trans2[en_dash] = trans2[" "]
|
||||||
|
|
||||||
patterns = [
|
patterns = [
|
||||||
lambda q: re.compile(
|
lambda q: re.compile(
|
||||||
r"(?<!-)\[\[(%s)%s\|(?=.*\]\])"
|
r"(?<!-)(?:\[\[(?:[^]]+\|)?)?(%s)%s(?:\]\])?"
|
||||||
% (
|
% (
|
||||||
re.escape(q[0]),
|
re.escape(q[0]),
|
||||||
"".join("-?" + (trans2[c] if c in trans2 else re.escape(c)) for c in q[1:]),
|
"".join("-?" + (trans2[c] if c in trans2 else re.escape(c)) for c in q[1:]),
|
||||||
|
@ -31,19 +31,10 @@ patterns = [
|
||||||
re.I,
|
re.I,
|
||||||
),
|
),
|
||||||
lambda q: re.compile(
|
lambda q: re.compile(
|
||||||
r"(?<!-)\[\[(?:(?!File:)(?:[^]]+\|)?)(%s)%s\]\]"
|
r"(?<!-)\[\[[^|]+\|(%s)%s\]\]" % (re.escape(q[0]), re.escape(q[1:])), re.I
|
||||||
% (
|
|
||||||
re.escape(q[0]),
|
|
||||||
"".join("-?" + (trans2[c] if c in trans2 else re.escape(c)) for c in q[1:]),
|
|
||||||
),
|
|
||||||
re.I,
|
|
||||||
),
|
),
|
||||||
lambda q: re.compile(
|
lambda q: re.compile(
|
||||||
r"(?<!-)\[\[(?!File:)[^|]+\|(%s)%s\]\]" % (re.escape(q[0]), re.escape(q[1:])),
|
r"(?<!-)\[\[[^|]+\|(%s)%s(?:\]\])?"
|
||||||
re.I,
|
|
||||||
),
|
|
||||||
lambda q: re.compile(
|
|
||||||
r"(?<!-)\[\[(?!File:)[^|]+\|(%s)%s(?:\]\])?"
|
|
||||||
% (
|
% (
|
||||||
re.escape(q[0]),
|
re.escape(q[0]),
|
||||||
"".join("-?" + (trans2[c] if c in trans2 else re.escape(c)) for c in q[1:]),
|
"".join("-?" + (trans2[c] if c in trans2 else re.escape(c)) for c in q[1:]),
|
||||||
|
@ -55,7 +46,7 @@ patterns = [
|
||||||
r"(?<!-)(%s)%s"
|
r"(?<!-)(%s)%s"
|
||||||
% (
|
% (
|
||||||
re.escape(q[0]),
|
re.escape(q[0]),
|
||||||
"".join((trans2[c] if c in trans2 else re.escape(c)) for c in q[1:]),
|
"".join((trans[c] if c in trans else re.escape(c)) for c in q[1:]),
|
||||||
),
|
),
|
||||||
re.I,
|
re.I,
|
||||||
),
|
),
|
||||||
|
@ -63,13 +54,11 @@ patterns = [
|
||||||
|
|
||||||
|
|
||||||
class NoMatch(Exception):
|
class NoMatch(Exception):
|
||||||
"""No match."""
|
pass
|
||||||
|
|
||||||
|
|
||||||
re_cite = re.compile(
|
re_cite = re.compile(
|
||||||
# r"<ref( [^>]*?)?>\s*({{cite.*?}}|\[https?://[^]]*?\])\s*</ref>", re.I | re.S
|
r"<ref( [^>]*?)?>\s*({{cite.*?}}|\[https?://[^]]*?\])\s*</ref>", re.I | re.S
|
||||||
r"<ref( [^>]*?)?>.*</ref>",
|
|
||||||
re.I | re.S,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -109,7 +98,7 @@ def section_iter(text: str) -> typing.Iterator[tuple[str | None, str]]:
|
||||||
|
|
||||||
|
|
||||||
def get_subsections(text: str, section_num: int) -> str:
|
def get_subsections(text: str, section_num: int) -> str:
|
||||||
"""Retrieve the text of subsections for a given section number within an article."""
|
"retrieve the text of subsections for a given section number within an article"
|
||||||
found = ""
|
found = ""
|
||||||
collection_level = None
|
collection_level = None
|
||||||
for num, (heading, body) in enumerate(section_iter(text)):
|
for num, (heading, body) in enumerate(section_iter(text)):
|
||||||
|
@ -131,7 +120,7 @@ def get_subsections(text: str, section_num: int) -> str:
|
||||||
return found
|
return found
|
||||||
|
|
||||||
|
|
||||||
def match_found(m: re.Match[str], q: str, linkto: str | None) -> str:
|
def match_found(m, q, linkto):
|
||||||
if q[1:] == m.group(0)[1:]:
|
if q[1:] == m.group(0)[1:]:
|
||||||
replacement = m.group(1) + q[1:]
|
replacement = m.group(1) + q[1:]
|
||||||
elif any(c.isupper() for c in q[1:]) or m.group(0) == m.group(0).upper():
|
elif any(c.isupper() for c in q[1:]) or m.group(0) == m.group(0).upper():
|
||||||
|
@ -170,34 +159,23 @@ def parse_links(text: str) -> typing.Iterator[tuple[str, str]]:
|
||||||
yield ("text", text[prev:])
|
yield ("text", text[prev:])
|
||||||
|
|
||||||
|
|
||||||
def mk_link_matcher(q: str) -> typing.Callable[[str], re.Match[str] | None]:
|
def mk_link_matcher(q):
|
||||||
"""Make link matcher."""
|
|
||||||
re_links = [p(q) for p in patterns]
|
re_links = [p(q) for p in patterns]
|
||||||
|
|
||||||
def search_for_link(text: str) -> re.Match[str] | None:
|
def search_for_link(text):
|
||||||
for re_link in re_links:
|
for re_link in re_links:
|
||||||
m = re_link.search(text)
|
m = re_link.search(text)
|
||||||
if m and m.group(0).count("[[") < 4:
|
if m and m.group(0).count("[[") < 4:
|
||||||
return m
|
return m
|
||||||
return None
|
|
||||||
|
|
||||||
return search_for_link
|
return search_for_link
|
||||||
|
|
||||||
|
|
||||||
def add_link(m: re.Match[str], replacement: str, text: str) -> str:
|
def add_link(m, replacement, text):
|
||||||
"""Add link to text."""
|
return m.re.sub(lambda m: "[[%s]]" % replacement, text, count=1)
|
||||||
|
|
||||||
matched_text = m.group(0)
|
|
||||||
if matched_text.startswith("[[") and matched_text.endswith("|"):
|
|
||||||
return m.re.sub(lambda m: f"[[{replacement}|", text, count=1)
|
|
||||||
else:
|
|
||||||
return m.re.sub(lambda m: f"[[{replacement}]]", text, count=1)
|
|
||||||
|
|
||||||
|
|
||||||
def find_link_in_chunk(
|
def find_link_in_chunk(q, content, linkto=None):
|
||||||
q: str, content: str, linkto: str | None = None
|
|
||||||
) -> tuple[str, str | None, str | None]:
|
|
||||||
"""Find link in chunk."""
|
|
||||||
search_for_link = mk_link_matcher(q)
|
search_for_link = mk_link_matcher(q)
|
||||||
new_content = ""
|
new_content = ""
|
||||||
replacement = None
|
replacement = None
|
||||||
|
@ -267,7 +245,7 @@ def find_link_in_chunk(
|
||||||
return (new_content, replacement, found_text_to_link)
|
return (new_content, replacement, found_text_to_link)
|
||||||
|
|
||||||
|
|
||||||
def find_link_in_text(q: str, content: str) -> tuple[str, str]:
|
def find_link_in_text(q, content):
|
||||||
(new_content, replacement) = find_link_in_chunk(q, content)
|
(new_content, replacement) = find_link_in_chunk(q, content)
|
||||||
if replacement:
|
if replacement:
|
||||||
return (new_content, replacement)
|
return (new_content, replacement)
|
||||||
|
@ -302,7 +280,7 @@ def find_link_in_content(q, content, linkto=None):
|
||||||
raise LinkReplace if link_replace else NoMatch
|
raise LinkReplace if link_replace else NoMatch
|
||||||
|
|
||||||
|
|
||||||
def find_link_and_section(q: str, content: str, linkto: str | None = None):
|
def find_link_and_section(q, content, linkto=None):
|
||||||
if linkto:
|
if linkto:
|
||||||
try:
|
try:
|
||||||
return find_link_and_section(linkto, content)
|
return find_link_and_section(linkto, content)
|
||||||
|
@ -320,9 +298,7 @@ def find_link_and_section(q: str, content: str, linkto: str | None = None):
|
||||||
if header:
|
if header:
|
||||||
new_content += header
|
new_content += header
|
||||||
for token_type, text in parse_cite(section_text):
|
for token_type, text in parse_cite(section_text):
|
||||||
if token_type != "text" or replacement:
|
if token_type == "text" and not replacement:
|
||||||
new_content += text
|
|
||||||
continue
|
|
||||||
new_text = ""
|
new_text = ""
|
||||||
for token_type2, text2 in parse_links(text):
|
for token_type2, text2 in parse_links(text):
|
||||||
if token_type2 == "link" and not replacement:
|
if token_type2 == "link" and not replacement:
|
||||||
|
@ -362,7 +338,9 @@ def find_link_and_section(q: str, content: str, linkto: str | None = None):
|
||||||
|
|
||||||
def find_refs(text: str) -> list[str]:
|
def find_refs(text: str) -> list[str]:
|
||||||
"""Find <ref> in wikitext."""
|
"""Find <ref> in wikitext."""
|
||||||
|
|
||||||
refs = re.findall("<ref(?:[^>]*)>(.+?)</ref>", text)
|
refs = re.findall("<ref(?:[^>]*)>(.+?)</ref>", text)
|
||||||
|
print(refs)
|
||||||
return refs
|
return refs
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -47,7 +47,6 @@ def search_count_with_link(q: str) -> int:
|
||||||
|
|
||||||
|
|
||||||
def parse_contribs() -> list[tuple[str, int]]:
|
def parse_contribs() -> list[tuple[str, int]]:
|
||||||
"""Parse user contributions."""
|
|
||||||
re_comment = re.compile(r"^link \[\[(.*)\]\] using")
|
re_comment = re.compile(r"^link \[\[(.*)\]\] using")
|
||||||
|
|
||||||
links: collections.Counter[str] = collections.Counter()
|
links: collections.Counter[str] = collections.Counter()
|
||||||
|
@ -71,10 +70,10 @@ def parse_contribs() -> list[tuple[str, int]]:
|
||||||
return links.most_common(200)
|
return links.most_common(200)
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
|
||||||
with open("examples") as f:
|
with open("examples") as f:
|
||||||
seen = {json.loads(line)["title"] for line in f}
|
seen = {json.loads(line)["title"] for line in f}
|
||||||
|
|
||||||
|
|
||||||
out = open("examples", "a")
|
out = open("examples", "a")
|
||||||
for from_title, num in parse_contribs():
|
for from_title, num in parse_contribs():
|
||||||
if from_title in seen:
|
if from_title in seen:
|
||||||
|
@ -96,6 +95,7 @@ def main() -> None:
|
||||||
|
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
|
|
||||||
|
|
||||||
count = search_count(from_title)
|
count = search_count(from_title)
|
||||||
count_with_link = search_count_with_link(from_title)
|
count_with_link = search_count_with_link(from_title)
|
||||||
ratio = float(count_with_link) / float(count)
|
ratio = float(count_with_link) / float(count)
|
||||||
|
@ -112,7 +112,3 @@ def main() -> None:
|
||||||
|
|
||||||
# ret = core.do_search(from_title)
|
# ret = core.do_search(from_title)
|
||||||
# print(ret)
|
# print(ret)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
|
|
|
@ -289,8 +289,8 @@ def get_best_hit(title: str, hits: list[Hit]) -> tuple[Hit, dict[str, typing.Any
|
||||||
for hit in hits:
|
for hit in hits:
|
||||||
if hit["title"].lower() == title.lower():
|
if hit["title"].lower() == title.lower():
|
||||||
continue
|
continue
|
||||||
# if match_type(title, hit["snippet"]) != "exact":
|
if match_type(title, hit["snippet"]) != "exact":
|
||||||
# continue
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
print(f'get diff: {hit["title"]}, {title}')
|
print(f'get diff: {hit["title"]}, {title}')
|
||||||
|
|
Loading…
Reference in a new issue