Consider all matches, not just exact matches

Tidy code for building list of examples
Remove debugging output
2023-12-09 18:45:19 +00:00 · 2023-12-09 18:43:56 +00:00 · 2023-12-09 18:43:05 +00:00 · 2023-12-09 18:42:53 +00:00 · 2023-12-09 18:42:03 +00:00
3 changed files with 102 additions and 76 deletions
--- a/add_links/match.py
+++ b/add_links/match.py
@@ -11,7 +11,7 @@ re_link_in_text = re.compile(r"\[\[[^]]+?\]\]", re.I | re.S)


 class LinkReplace(Exception):
-    pass
+    """Replaces and existing link."""


 en_dash = "\u2013"
@@ -23,7 +23,7 @@ trans2[en_dash] = trans2[" "]

 patterns = [
    lambda q: re.compile(
-        r"(?<!-)(?:\[\[(?:[^]]+\|)?)?(%s)%s(?:\]\])?"
+        r"(?<!-)\[\[(%s)%s\|(?=.*\]\])"
        % (
            re.escape(q[0]),
            "".join("-?" + (trans2[c] if c in trans2 else re.escape(c)) for c in q[1:]),
@@ -31,10 +31,19 @@ patterns = [
        re.I,
    ),
    lambda q: re.compile(
-        r"(?<!-)\[\[[^|]+\|(%s)%s\]\]" % (re.escape(q[0]), re.escape(q[1:])), re.I
+        r"(?<!-)\[\[(?:(?!File:)(?:[^]]+\|)?)(%s)%s\]\]"
+        % (
+            re.escape(q[0]),
+            "".join("-?" + (trans2[c] if c in trans2 else re.escape(c)) for c in q[1:]),
+        ),
+        re.I,
    ),
    lambda q: re.compile(
-        r"(?<!-)\[\[[^|]+\|(%s)%s(?:\]\])?"
+        r"(?<!-)\[\[(?!File:)[^|]+\|(%s)%s\]\]" % (re.escape(q[0]), re.escape(q[1:])),
+        re.I,
+    ),
+    lambda q: re.compile(
+        r"(?<!-)\[\[(?!File:)[^|]+\|(%s)%s(?:\]\])?"
        % (
            re.escape(q[0]),
            "".join("-?" + (trans2[c] if c in trans2 else re.escape(c)) for c in q[1:]),
@@ -46,7 +55,7 @@ patterns = [
        r"(?<!-)(%s)%s"
        % (
            re.escape(q[0]),
-            "".join((trans[c] if c in trans else re.escape(c)) for c in q[1:]),
+            "".join((trans2[c] if c in trans2 else re.escape(c)) for c in q[1:]),
        ),
        re.I,
    ),
@@ -54,11 +63,13 @@ patterns = [


 class NoMatch(Exception):
-    pass
+    """No match."""


 re_cite = re.compile(
-    r"<ref( [^>]*?)?>\s*({{cite.*?}}|\[https?://[^]]*?\])\s*</ref>", re.I | re.S
+    # r"<ref( [^>]*?)?>\s*({{cite.*?}}|\[https?://[^]]*?\])\s*</ref>", re.I | re.S
+    r"<ref( [^>]*?)?>.*</ref>",
+    re.I | re.S,
 )


@@ -98,7 +109,7 @@ def section_iter(text: str) -> typing.Iterator[tuple[str | None, str]]:


 def get_subsections(text: str, section_num: int) -> str:
-    "retrieve the text of subsections for a given section number within an article"
+    """Retrieve the text of subsections for a given section number within an article."""
    found = ""
    collection_level = None
    for num, (heading, body) in enumerate(section_iter(text)):
@@ -120,7 +131,7 @@ def get_subsections(text: str, section_num: int) -> str:
    return found


-def match_found(m, q, linkto):
+def match_found(m: re.Match[str], q: str, linkto: str | None) -> str:
    if q[1:] == m.group(0)[1:]:
        replacement = m.group(1) + q[1:]
    elif any(c.isupper() for c in q[1:]) or m.group(0) == m.group(0).upper():
@@ -159,23 +170,34 @@ def parse_links(text: str) -> typing.Iterator[tuple[str, str]]:
        yield ("text", text[prev:])


-def mk_link_matcher(q):
+def mk_link_matcher(q: str) -> typing.Callable[[str], re.Match[str] | None]:
+    """Make link matcher."""
    re_links = [p(q) for p in patterns]

-    def search_for_link(text):
+    def search_for_link(text: str) -> re.Match[str] | None:
        for re_link in re_links:
            m = re_link.search(text)
            if m and m.group(0).count("[[") < 4:
                return m
+        return None

    return search_for_link


-def add_link(m, replacement, text):
-    return m.re.sub(lambda m: "[[%s]]" % replacement, text, count=1)
+def add_link(m: re.Match[str], replacement: str, text: str) -> str:
+    """Add link to text."""
+
+    matched_text = m.group(0)
+    if matched_text.startswith("[[") and matched_text.endswith("|"):
+        return m.re.sub(lambda m: f"[[{replacement}|", text, count=1)
+    else:
+        return m.re.sub(lambda m: f"[[{replacement}]]", text, count=1)


-def find_link_in_chunk(q, content, linkto=None):
+def find_link_in_chunk(
+    q: str, content: str, linkto: str | None = None
+) -> tuple[str, str | None, str | None]:
+    """Find link in chunk."""
    search_for_link = mk_link_matcher(q)
    new_content = ""
    replacement = None
@@ -245,7 +267,7 @@ def find_link_in_chunk(q, content, linkto=None):
    return (new_content, replacement, found_text_to_link)


-def find_link_in_text(q, content):
+def find_link_in_text(q: str, content: str) -> tuple[str, str]:
    (new_content, replacement) = find_link_in_chunk(q, content)
    if replacement:
        return (new_content, replacement)
@@ -280,7 +302,7 @@ def find_link_in_content(q, content, linkto=None):
    raise LinkReplace if link_replace else NoMatch


-def find_link_and_section(q, content, linkto=None):
+def find_link_and_section(q: str, content: str, linkto: str | None = None):
    if linkto:
        try:
            return find_link_and_section(linkto, content)
@@ -298,30 +320,32 @@ def find_link_and_section(q, content, linkto=None):
        if header:
            new_content += header
        for token_type, text in parse_cite(section_text):
-            if token_type == "text" and not replacement:
-                new_text = ""
-                for token_type2, text2 in parse_links(text):
-                    if token_type2 == "link" and not replacement:
-                        link_text = text2[2:-2]
-                        if "|" in link_text:
-                            link_dest, link_text = link_text.split("|", 1)
-                        else:
-                            link_dest = None
-                        m = search_for_link(link_text)
-                        if m:
-                            if link_dest:
-                                found["link_dest"] = link_dest
-                            found["link_text"] = link_text
-                            replacement = match_found(m, q, None)
-                            text2 = add_link(m, replacement, link_text)
-                    new_text += text2
-                if replacement:
-                    text = new_text
-                else:
-                    m = search_for_link(text)
+            if token_type != "text" or replacement:
+                new_content += text
+                continue
+            new_text = ""
+            for token_type2, text2 in parse_links(text):
+                if token_type2 == "link" and not replacement:
+                    link_text = text2[2:-2]
+                    if "|" in link_text:
+                        link_dest, link_text = link_text.split("|", 1)
+                    else:
+                        link_dest = None
+                    m = search_for_link(link_text)
                    if m:
-                        replacement = match_found(m, q, linkto)
-                        text = add_link(m, replacement, text)
+                        if link_dest:
+                            found["link_dest"] = link_dest
+                        found["link_text"] = link_text
+                        replacement = match_found(m, q, None)
+                        text2 = add_link(m, replacement, link_text)
+                new_text += text2
+            if replacement:
+                text = new_text
+            else:
+                m = search_for_link(text)
+                if m:
+                    replacement = match_found(m, q, linkto)
+                    text = add_link(m, replacement, text)
            new_content += text
        if replacement:
            found.update(
@@ -338,9 +362,7 @@ def find_link_and_section(q, content, linkto=None):

 def find_refs(text: str) -> list[str]:
    """Find <ref> in wikitext."""
-
    refs = re.findall("<ref(?:[^>]*)>(.+?)</ref>", text)
-    print(refs)
    return refs


--- a/cmdline.py
+++ b/cmdline.py
@@ -47,6 +47,7 @@ def search_count_with_link(q: str) -> int:


 def parse_contribs() -> list[tuple[str, int]]:
+    """Parse user contributions."""
    re_comment = re.compile(r"^link \[\[(.*)\]\] using")

    links: collections.Counter[str] = collections.Counter()
@@ -70,45 +71,48 @@ def parse_contribs() -> list[tuple[str, int]]:
    return links.most_common(200)


-with open("examples") as f:
-    seen = {json.loads(line)["title"] for line in f}
+def main() -> None:
+    with open("examples") as f:
+        seen = {json.loads(line)["title"] for line in f}

+    out = open("examples", "a")
+    for from_title, num in parse_contribs():
+        if from_title in seen:
+            continue
+        count = search_count(from_title)
+        count_with_link = search_count_with_link(from_title)
+        ratio = float(count_with_link) / float(count)
+
+        print(from_title, count, count_with_link, f"{ratio:.1%}")
+        print(
+            json.dumps(
+                {"title": from_title, "total": count, "with_links": count_with_link}
+            ),
+            file=out,
+        )
+        out.flush()
+        time.sleep(0.1)
+    out.close()
+
+    sys.exit(0)

-out = open("examples", "a")
-for from_title, num in parse_contribs():
-    if from_title in seen:
-        continue
    count = search_count(from_title)
    count_with_link = search_count_with_link(from_title)
    ratio = float(count_with_link) / float(count)

-    print(from_title, count, count_with_link, f"{ratio:.1%}")
-    print(
-        json.dumps(
-            {"title": from_title, "total": count, "with_links": count_with_link}
-        ),
-        file=out,
-    )
-    out.flush()
-    time.sleep(0.1)
-out.close()
+    print(count, count_with_link, f"{ratio:.1%}")

-sys.exit(0)
+    sys.exit(0)
+
+    totalhits, search_hits = search_no_link(from_title)
+
+    for hit in search_hits:
+        print("  ", hit)
+    print(count, count_with_link, f"{ratio:.1%}", totalhits, len(search_hits))
+
+    # ret = core.do_search(from_title)
+    # print(ret)


-count = search_count(from_title)
-count_with_link = search_count_with_link(from_title)
-ratio = float(count_with_link) / float(count)
-
-print(count, count_with_link, f"{ratio:.1%}")
-
-sys.exit(0)
-
-totalhits, search_hits = search_no_link(from_title)
-
-for hit in search_hits:
-    print("  ", hit)
-print(count, count_with_link, f"{ratio:.1%}", totalhits, len(search_hits))
-
-# ret = core.do_search(from_title)
-# print(ret)
+if __name__ == "__main__":
+    main()
--- a/web_view.py
+++ b/web_view.py
@@ -289,8 +289,8 @@ def get_best_hit(title: str, hits: list[Hit]) -> tuple[Hit, dict[str, typing.Any
    for hit in hits:
        if hit["title"].lower() == title.lower():
            continue
-        if match_type(title, hit["snippet"]) != "exact":
-            continue
+        # if match_type(title, hit["snippet"]) != "exact":
+        #     continue

        try:
            print(f'get diff: {hit["title"]}, {title}')
Author	SHA1	Message	Date
Edward Betts	b76a6707f2	Consider all matches, not just exact matches	2023-12-09 18:45:19 +00:00
Edward Betts	ba56274022	Tidy code for building list of examples	2023-12-09 18:43:56 +00:00
Edward Betts	479dc864fd	Remove debugging output	2023-12-09 18:43:05 +00:00
Edward Betts	14d8539298	Link matching improvements	2023-12-09 18:42:53 +00:00
Edward Betts	1da620875a	Add type hints and docstrings	2023-12-09 18:42:03 +00:00