diff --git a/add_links/match.py b/add_links/match.py index cc18fdc..dc3dabc 100644 --- a/add_links/match.py +++ b/add_links/match.py @@ -11,7 +11,7 @@ re_link_in_text = re.compile(r"\[\[[^]]+?\]\]", re.I | re.S) class LinkReplace(Exception): - """Replaces and existing link.""" + pass en_dash = "\u2013" @@ -23,7 +23,7 @@ trans2[en_dash] = trans2[" "] patterns = [ lambda q: re.compile( - r"(?]*?)?>\s*({{cite.*?}}|\[https?://[^]]*?\])\s*", re.I | re.S - r"]*?)?>.*", - re.I | re.S, + r"]*?)?>\s*({{cite.*?}}|\[https?://[^]]*?\])\s*", re.I | re.S ) @@ -109,7 +98,7 @@ def section_iter(text: str) -> typing.Iterator[tuple[str | None, str]]: def get_subsections(text: str, section_num: int) -> str: - """Retrieve the text of subsections for a given section number within an article.""" + "retrieve the text of subsections for a given section number within an article" found = "" collection_level = None for num, (heading, body) in enumerate(section_iter(text)): @@ -131,7 +120,7 @@ def get_subsections(text: str, section_num: int) -> str: return found -def match_found(m: re.Match[str], q: str, linkto: str | None) -> str: +def match_found(m, q, linkto): if q[1:] == m.group(0)[1:]: replacement = m.group(1) + q[1:] elif any(c.isupper() for c in q[1:]) or m.group(0) == m.group(0).upper(): @@ -170,34 +159,23 @@ def parse_links(text: str) -> typing.Iterator[tuple[str, str]]: yield ("text", text[prev:]) -def mk_link_matcher(q: str) -> typing.Callable[[str], re.Match[str] | None]: - """Make link matcher.""" +def mk_link_matcher(q): re_links = [p(q) for p in patterns] - def search_for_link(text: str) -> re.Match[str] | None: + def search_for_link(text): for re_link in re_links: m = re_link.search(text) if m and m.group(0).count("[[") < 4: return m - return None return search_for_link -def add_link(m: re.Match[str], replacement: str, text: str) -> str: - """Add link to text.""" - - matched_text = m.group(0) - if matched_text.startswith("[[") and matched_text.endswith("|"): - return m.re.sub(lambda m: f"[[{replacement}|", text, count=1) - else: - return m.re.sub(lambda m: f"[[{replacement}]]", text, count=1) +def add_link(m, replacement, text): + return m.re.sub(lambda m: "[[%s]]" % replacement, text, count=1) -def find_link_in_chunk( - q: str, content: str, linkto: str | None = None -) -> tuple[str, str | None, str | None]: - """Find link in chunk.""" +def find_link_in_chunk(q, content, linkto=None): search_for_link = mk_link_matcher(q) new_content = "" replacement = None @@ -267,7 +245,7 @@ def find_link_in_chunk( return (new_content, replacement, found_text_to_link) -def find_link_in_text(q: str, content: str) -> tuple[str, str]: +def find_link_in_text(q, content): (new_content, replacement) = find_link_in_chunk(q, content) if replacement: return (new_content, replacement) @@ -302,7 +280,7 @@ def find_link_in_content(q, content, linkto=None): raise LinkReplace if link_replace else NoMatch -def find_link_and_section(q: str, content: str, linkto: str | None = None): +def find_link_and_section(q, content, linkto=None): if linkto: try: return find_link_and_section(linkto, content) @@ -320,32 +298,30 @@ def find_link_and_section(q: str, content: str, linkto: str | None = None): if header: new_content += header for token_type, text in parse_cite(section_text): - if token_type != "text" or replacement: - new_content += text - continue - new_text = "" - for token_type2, text2 in parse_links(text): - if token_type2 == "link" and not replacement: - link_text = text2[2:-2] - if "|" in link_text: - link_dest, link_text = link_text.split("|", 1) - else: - link_dest = None - m = search_for_link(link_text) + if token_type == "text" and not replacement: + new_text = "" + for token_type2, text2 in parse_links(text): + if token_type2 == "link" and not replacement: + link_text = text2[2:-2] + if "|" in link_text: + link_dest, link_text = link_text.split("|", 1) + else: + link_dest = None + m = search_for_link(link_text) + if m: + if link_dest: + found["link_dest"] = link_dest + found["link_text"] = link_text + replacement = match_found(m, q, None) + text2 = add_link(m, replacement, link_text) + new_text += text2 + if replacement: + text = new_text + else: + m = search_for_link(text) if m: - if link_dest: - found["link_dest"] = link_dest - found["link_text"] = link_text - replacement = match_found(m, q, None) - text2 = add_link(m, replacement, link_text) - new_text += text2 - if replacement: - text = new_text - else: - m = search_for_link(text) - if m: - replacement = match_found(m, q, linkto) - text = add_link(m, replacement, text) + replacement = match_found(m, q, linkto) + text = add_link(m, replacement, text) new_content += text if replacement: found.update( @@ -362,7 +338,9 @@ def find_link_and_section(q: str, content: str, linkto: str | None = None): def find_refs(text: str) -> list[str]: """Find in wikitext.""" + refs = re.findall("]*)>(.+?)", text) + print(refs) return refs diff --git a/cmdline.py b/cmdline.py index 1b2edb1..1d1de7c 100755 --- a/cmdline.py +++ b/cmdline.py @@ -47,7 +47,6 @@ def search_count_with_link(q: str) -> int: def parse_contribs() -> list[tuple[str, int]]: - """Parse user contributions.""" re_comment = re.compile(r"^link \[\[(.*)\]\] using") links: collections.Counter[str] = collections.Counter() @@ -71,48 +70,45 @@ def parse_contribs() -> list[tuple[str, int]]: return links.most_common(200) -def main() -> None: - with open("examples") as f: - seen = {json.loads(line)["title"] for line in f} +with open("examples") as f: + seen = {json.loads(line)["title"] for line in f} - out = open("examples", "a") - for from_title, num in parse_contribs(): - if from_title in seen: - continue - count = search_count(from_title) - count_with_link = search_count_with_link(from_title) - ratio = float(count_with_link) / float(count) - - print(from_title, count, count_with_link, f"{ratio:.1%}") - print( - json.dumps( - {"title": from_title, "total": count, "with_links": count_with_link} - ), - file=out, - ) - out.flush() - time.sleep(0.1) - out.close() - - sys.exit(0) +out = open("examples", "a") +for from_title, num in parse_contribs(): + if from_title in seen: + continue count = search_count(from_title) count_with_link = search_count_with_link(from_title) ratio = float(count_with_link) / float(count) - print(count, count_with_link, f"{ratio:.1%}") + print(from_title, count, count_with_link, f"{ratio:.1%}") + print( + json.dumps( + {"title": from_title, "total": count, "with_links": count_with_link} + ), + file=out, + ) + out.flush() + time.sleep(0.1) +out.close() - sys.exit(0) - - totalhits, search_hits = search_no_link(from_title) - - for hit in search_hits: - print(" ", hit) - print(count, count_with_link, f"{ratio:.1%}", totalhits, len(search_hits)) - - # ret = core.do_search(from_title) - # print(ret) +sys.exit(0) -if __name__ == "__main__": - main() +count = search_count(from_title) +count_with_link = search_count_with_link(from_title) +ratio = float(count_with_link) / float(count) + +print(count, count_with_link, f"{ratio:.1%}") + +sys.exit(0) + +totalhits, search_hits = search_no_link(from_title) + +for hit in search_hits: + print(" ", hit) +print(count, count_with_link, f"{ratio:.1%}", totalhits, len(search_hits)) + +# ret = core.do_search(from_title) +# print(ret) diff --git a/web_view.py b/web_view.py index 354ee95..0f8a1c6 100755 --- a/web_view.py +++ b/web_view.py @@ -289,8 +289,8 @@ def get_best_hit(title: str, hits: list[Hit]) -> tuple[Hit, dict[str, typing.Any for hit in hits: if hit["title"].lower() == title.lower(): continue - # if match_type(title, hit["snippet"]) != "exact": - # continue + if match_type(title, hit["snippet"]) != "exact": + continue try: print(f'get diff: {hit["title"]}, {title}')