diff --git a/add_links/match.py b/add_links/match.py index dc3dabc..cc18fdc 100644 --- a/add_links/match.py +++ b/add_links/match.py @@ -11,7 +11,7 @@ re_link_in_text = re.compile(r"\[\[[^]]+?\]\]", re.I | re.S) class LinkReplace(Exception): - pass + """Replaces an existing link.""" en_dash = "\u2013" @@ -23,7 +23,7 @@ trans2[en_dash] = trans2[" "] patterns = [ lambda q: re.compile( - r"(?]*?)?>\s*({{cite.*?}}|\[https?://[^]]*?\])\s*", re.I | re.S + # r"]*?)?>\s*({{cite.*?}}|\[https?://[^]]*?\])\s*", re.I | re.S + r"]*?)?>.*", + re.I | re.S, ) @@ -98,7 +109,7 @@ def section_iter(text: str) -> typing.Iterator[tuple[str | None, str]]: def get_subsections(text: str, section_num: int) -> str: - "retrieve the text of subsections for a given section number within an article" + """Retrieve the text of subsections for a given section number within an article.""" found = "" collection_level = None for num, (heading, body) in enumerate(section_iter(text)): @@ -120,7 +131,7 @@ def get_subsections(text: str, section_num: int) -> str: return found -def match_found(m, q, linkto): +def match_found(m: re.Match[str], q: str, linkto: str | None) -> str: if q[1:] == m.group(0)[1:]: replacement = m.group(1) + q[1:] elif any(c.isupper() for c in q[1:]) or m.group(0) == m.group(0).upper(): @@ -159,23 +170,34 @@ def parse_links(text: str) -> typing.Iterator[tuple[str, str]]: yield ("text", text[prev:]) -def mk_link_matcher(q): +def mk_link_matcher(q: str) -> typing.Callable[[str], re.Match[str] | None]: + """Make link matcher.""" re_links = [p(q) for p in patterns] - def search_for_link(text): + def search_for_link(text: str) -> re.Match[str] | None: for re_link in re_links: m = re_link.search(text) if m and m.group(0).count("[[") < 4: return m + return None return search_for_link -def add_link(m, replacement, text): - return m.re.sub(lambda m: "[[%s]]" % replacement, text, count=1) +def add_link(m: re.Match[str], replacement: str, text: str) -> str: + """Add link to text.""" +
matched_text = m.group(0) + if matched_text.startswith("[[") and matched_text.endswith("|"): + return m.re.sub(lambda m: f"[[{replacement}|", text, count=1) + else: + return m.re.sub(lambda m: f"[[{replacement}]]", text, count=1) -def find_link_in_chunk(q, content, linkto=None): +def find_link_in_chunk( + q: str, content: str, linkto: str | None = None +) -> tuple[str, str | None, str | None]: + """Find link in chunk.""" search_for_link = mk_link_matcher(q) new_content = "" replacement = None @@ -245,7 +267,7 @@ def find_link_in_chunk(q, content, linkto=None): return (new_content, replacement, found_text_to_link) -def find_link_in_text(q, content): +def find_link_in_text(q: str, content: str) -> tuple[str, str]: (new_content, replacement) = find_link_in_chunk(q, content) if replacement: return (new_content, replacement) @@ -280,7 +302,7 @@ def find_link_in_content(q, content, linkto=None): raise LinkReplace if link_replace else NoMatch -def find_link_and_section(q, content, linkto=None): +def find_link_and_section(q: str, content: str, linkto: str | None = None): if linkto: try: return find_link_and_section(linkto, content) @@ -298,30 +320,32 @@ def find_link_and_section(q, content, linkto=None): if header: new_content += header for token_type, text in parse_cite(section_text): - if token_type == "text" and not replacement: - new_text = "" - for token_type2, text2 in parse_links(text): - if token_type2 == "link" and not replacement: - link_text = text2[2:-2] - if "|" in link_text: - link_dest, link_text = link_text.split("|", 1) - else: - link_dest = None - m = search_for_link(link_text) - if m: - if link_dest: - found["link_dest"] = link_dest - found["link_text"] = link_text - replacement = match_found(m, q, None) - text2 = add_link(m, replacement, link_text) - new_text += text2 - if replacement: - text = new_text - else: - m = search_for_link(text) + if token_type != "text" or replacement: + new_content += text + continue + new_text = "" + for token_type2, text2 
in parse_links(text): + if token_type2 == "link" and not replacement: + link_text = text2[2:-2] + if "|" in link_text: + link_dest, link_text = link_text.split("|", 1) + else: + link_dest = None + m = search_for_link(link_text) if m: - replacement = match_found(m, q, linkto) - text = add_link(m, replacement, text) + if link_dest: + found["link_dest"] = link_dest + found["link_text"] = link_text + replacement = match_found(m, q, None) + text2 = add_link(m, replacement, link_text) + new_text += text2 + if replacement: + text = new_text + else: + m = search_for_link(text) + if m: + replacement = match_found(m, q, linkto) + text = add_link(m, replacement, text) new_content += text if replacement: found.update( @@ -338,9 +362,7 @@ def find_link_and_section(q, content, linkto=None): def find_refs(text: str) -> list[str]: """Find in wikitext.""" - refs = re.findall("]*)>(.+?)", text) - print(refs) return refs diff --git a/cmdline.py b/cmdline.py index 1d1de7c..1b2edb1 100755 --- a/cmdline.py +++ b/cmdline.py @@ -47,6 +47,7 @@ def search_count_with_link(q: str) -> int: def parse_contribs() -> list[tuple[str, int]]: + """Parse user contributions.""" re_comment = re.compile(r"^link \[\[(.*)\]\] using") links: collections.Counter[str] = collections.Counter() @@ -70,45 +71,48 @@ def parse_contribs() -> list[tuple[str, int]]: return links.most_common(200) -with open("examples") as f: - seen = {json.loads(line)["title"] for line in f} +def main() -> None: + with open("examples") as f: + seen = {json.loads(line)["title"] for line in f} + out = open("examples", "a") + for from_title, num in parse_contribs(): + if from_title in seen: + continue + count = search_count(from_title) + count_with_link = search_count_with_link(from_title) + ratio = float(count_with_link) / float(count) + + print(from_title, count, count_with_link, f"{ratio:.1%}") + print( + json.dumps( + {"title": from_title, "total": count, "with_links": count_with_link} + ), + file=out, + ) + out.flush() + 
time.sleep(0.1) + out.close() + + sys.exit(0) -out = open("examples", "a") -for from_title, num in parse_contribs(): - if from_title in seen: - continue count = search_count(from_title) count_with_link = search_count_with_link(from_title) ratio = float(count_with_link) / float(count) - print(from_title, count, count_with_link, f"{ratio:.1%}") - print( - json.dumps( - {"title": from_title, "total": count, "with_links": count_with_link} - ), - file=out, - ) - out.flush() - time.sleep(0.1) -out.close() + print(count, count_with_link, f"{ratio:.1%}") -sys.exit(0) + sys.exit(0) + + totalhits, search_hits = search_no_link(from_title) + + for hit in search_hits: + print(" ", hit) + print(count, count_with_link, f"{ratio:.1%}", totalhits, len(search_hits)) + + # ret = core.do_search(from_title) + # print(ret) -count = search_count(from_title) -count_with_link = search_count_with_link(from_title) -ratio = float(count_with_link) / float(count) - -print(count, count_with_link, f"{ratio:.1%}") - -sys.exit(0) - -totalhits, search_hits = search_no_link(from_title) - -for hit in search_hits: - print(" ", hit) -print(count, count_with_link, f"{ratio:.1%}", totalhits, len(search_hits)) - -# ret = core.do_search(from_title) -# print(ret) +if __name__ == "__main__": + main() diff --git a/web_view.py b/web_view.py index 0f8a1c6..354ee95 100755 --- a/web_view.py +++ b/web_view.py @@ -289,8 +289,8 @@ def get_best_hit(title: str, hits: list[Hit]) -> tuple[Hit, dict[str, typing.Any for hit in hits: if hit["title"].lower() == title.lower(): continue - if match_type(title, hit["snippet"]) != "exact": - continue + # if match_type(title, hit["snippet"]) != "exact": + # continue try: print(f'get diff: {hit["title"]}, {title}')