Initial commit

2023-10-04 12:56:21 +01:00 · 2023-10-04 12:56:21 +01:00 · f07b407e7a
commit f07b407e7a
25 changed files with 2383 additions and 0 deletions
--- a/add_links/core.py
+++ b/add_links/core.py
@ -0,0 +1,198 @@
+"""Core functions."""
+
+import html
+import re
+import typing
+from pprint import pprint
+
+from .api import (
+    MediawikiError,
+    all_pages,
+    cat_start,
+    categorymembers,
+    find_disambig,
+    get_first_page,
+    wiki_backlink,
+    wiki_search,
+)
+from .util import case_flip_first, norm
+
+# Matches redirect wikitext such as "#REDIRECT [[Target#Section]]".
+# Group 1 is the first character of the target, group 2 the rest of the
+# target up to any "#", and group 3 the optional "#section" suffix.
+# The "#REDIRECT" keyword is matched case-sensitively as written.
+re_redirect = re.compile(r"#REDIRECT \[\[(.)([^#]*?)(#.*)?\]\]")
+
+
+def get_content_and_timestamp(title: str) -> tuple[str, str]:
+    """Fetch the text of an article and the timestamp of its revision.
+
+    Returns a ``(content, timestamp)`` pair taken from the first revision
+    in the API response. Raises MediawikiError, carrying the API's
+    ``invalidreason``, when the response flags the title as invalid.
+    """
+    page: dict[str, typing.Any] = get_first_page(
+        {
+            "prop": "revisions|info",
+            "rvprop": "content|timestamp",
+            "titles": title,
+        }
+    )
+    if page.get("invalid"):
+        raise MediawikiError(page["invalidreason"])
+    latest = page["revisions"][0]
+    return latest["content"], latest["timestamp"]
+
+
+def get_revision_info(title: str) -> dict[str, typing.Any]:
+    """Get info about latest revision of article.
+
+    Returns the first revision entry of the API response (requested with
+    content, timestamp and ids) with the page's ``pageid`` added to it.
+    Raises MediawikiError, carrying the API's ``invalidreason``, when the
+    response flags the title as invalid.
+    """
+    params = {
+        "prop": "revisions|info",
+        "rvprop": "content|timestamp|ids",
+        "titles": title,
+    }
+    json_data: dict[str, typing.Any] = get_first_page(params)
+    if json_data.get("invalid"):
+        raise MediawikiError(json_data["invalidreason"])
+    # No debug dump of the page data here: library code should not write
+    # to stdout on every call.
+    ret = json_data["revisions"][0]
+    ret["pageid"] = json_data["pageid"]
+    return typing.cast(dict[str, typing.Any], ret)
+
+
+def is_redirect_to(title_from: str, title_to: str) -> bool:
+    """Check whether the page title_from is a redirect pointing at title_to.
+
+    Underscores in title_from are treated as spaces. The first letter of
+    both the redirect target and title_to is upper-cased before comparing,
+    and any ``#section`` part of the redirect target is ignored.
+
+    Returns False when title_from is not a redirect, or when its text does
+    not start with redirect markup that re_redirect recognises.
+    """
+    title_from = title_from.replace("_", " ")
+    params = {"prop": "info", "titles": title_from}
+    if "redirect" not in get_first_page(params):
+        return False
+
+    params = {"prop": "revisions", "rvprop": "content", "titles": title_from}
+    page_text = get_first_page(params)["revisions"][0]["content"]
+    m = re_redirect.match(page_text)
+    if m is None:
+        # Explicit check instead of assert: asserts vanish under python -O,
+        # and unrecognised redirect markup should not crash the caller.
+        return False
+    # Slicing keeps an empty title_to from raising IndexError.
+    title_to = title_to[:1].upper() + title_to[1:]
+    return m.group(1).upper() + m.group(2) == title_to
+
+
+def find_longer(
+    q: str, search: list[dict[str, typing.Any]], articles: set[str]
+) -> list[str]:
+    """Collect titles that contain q but are longer than it.
+
+    Starts from the all_pages() result for q with its first letter
+    upper-cased, then appends every search hit whose title contains q
+    case-insensitively without being equal to it. As a side effect, each
+    such title, plus the first collection wiki_backlink() returns for it,
+    is added to the ``articles`` set passed in.
+    """
+    needle = q.lower()
+    titles: list[str] = all_pages(q[0].upper() + q[1:])
+    for hit in search:
+        title = hit["title"]
+        lowered = title.lower()
+        if needle in lowered and needle != lowered:
+            articles.add(title)
+            linked_articles, _ = wiki_backlink(title)
+            articles.update(linked_articles)
+            if title not in titles:
+                titles.append(title)
+
+    return titles
+
+
+def tidy_snippet(snippet: str) -> str:
+    """Strip search-highlight markup from a snippet and decode entities.
+
+    En dashes become plain hyphens, the ``searchmatch`` span tags are
+    dropped and the remaining text is passed through html.unescape().
+    """
+    substitutions = (
+        ("\u2013", "-"),
+        ("</span>", ""),
+        ('<span class="searchmatch">', ""),
+    )
+    for old, new in substitutions:
+        snippet = snippet.replace(old, new)
+    return html.unescape(snippet)
+
+
+def match_type(q: str, snippet: str) -> str | None:
+    """Discover match type: 'exact', 'case_mismatch' or None.
+
+    En dashes in q are treated as hyphens and the snippet is cleaned with
+    tidy_snippet() first. A match is 'exact' when q, or q with the case of
+    its first letter flipped, occurs in the snippet. For a q ending in
+    "y" the same test is repeated without the final letter, so plurals
+    such as "technologies" count as exact. Anything that only matches
+    case-insensitively is a 'case_mismatch'.
+
+    NOTE(review): for a q that does not end in "y", a case-insensitive
+    match on q minus its last character also yields 'case_mismatch'.
+    Confirm this fallback is intended.
+
+    >>> match_type('foo', 'foo')
+    'exact'
+    >>> match_type('foo', 'bar') is None
+    True
+    >>> match_type('bar', 'foo bar baz')
+    'exact'
+    >>> match_type('clean coal technology', 'foo clean coal technologies baz')
+    'exact'
+    >>> match_type('bar', 'foo Bar baz')
+    'exact'
+    >>> match_type('bar', 'foo BAR baz')
+    'case_mismatch'
+    >>> match_type('foo-bar', 'aa foo-bar cc')
+    'exact'
+    >>> match_type(u'foo\u2013bar', 'aa foo-bar cc')
+    'exact'
+    """
+    q = q.replace("\u2013", "-")
+    snippet = tidy_snippet(snippet)
+
+    if q in snippet or case_flip_first(q) in snippet:
+        return "exact"
+
+    lowered_snippet = snippet.lower()
+    any_case_hit = q.lower() in lowered_snippet
+    stem = q[:-1]
+
+    if q.endswith("y"):
+        # Try the stem so "technology" also finds "technologies".
+        if stem in snippet or case_flip_first(stem) in snippet:
+            return "exact"
+        return "case_mismatch" if any_case_hit else None
+
+    if any_case_hit or stem.lower() in lowered_snippet:
+        return "case_mismatch"
+    return None
+
+
+def do_search(
+    q: str, redirect_to: str | None = None
+) -> dict[str, int | list[dict[str, typing.Any]] | list[str] | None]:
+    """Search for q and filter out hits that need no further attention.
+
+    Runs wiki_search(q) and drops every hit whose title is:
+
+    * in the ``articles`` set built from wiki_backlink() for q (or for
+      redirect_to when given) and for the redirects that match q --
+      presumably pages already linking to the target; confirm against
+      the api module,
+    * a member of ``Category:<Q>`` or of one of the categories returned
+      by cat_start(q) (skipped when there are more than five), or
+    * reported by find_disambig().
+
+    Each remaining hit gains two keys: ``snippet_without_markup`` and
+    ``match`` (the match_type() result for the cleaned snippet).
+
+    Returns a dict with ``totalhits``, ``results`` (the filtered hits) and
+    ``longer`` (the find_longer() result, or None when q has six
+    characters or fewer).
+    """
+    this_title = q[0].upper() + q[1:]
+
+    totalhits, search_hits = wiki_search(q)
+    articles, redirects = wiki_backlink(redirect_to or q)
+    cm = set()
+    start = cat_start(q)
+    if len(start) > 5:
+        start = []  # big categories take too long
+    for cat in {"Category:" + this_title, *start}:
+        cm.update(categorymembers(cat))
+
+    norm_q = norm(q)
+    lower_q = q.lower()
+    norm_match_redirect = {r for r in redirects if norm(r) == norm_q}
+    longer_redirect = {r for r in redirects if lower_q in r.lower()}
+
+    articles.add(this_title)
+    if redirect_to:
+        articles.add(redirect_to[0].upper() + redirect_to[1:])
+
+    # The union is a new set, so growing ``redirects`` inside the loop
+    # does not affect the iteration.
+    for r in norm_match_redirect | longer_redirect:
+        articles.add(r)
+        a2, r2 = wiki_backlink(r)
+        articles.update(a2)
+        redirects.update(r2)
+
+    longer = find_longer(q, search_hits, articles) if len(q) > 6 else None
+
+    search: list[dict[str, typing.Any]] = [
+        doc
+        for doc in search_hits
+        if doc["title"] not in articles and doc["title"] not in cm
+    ]
+    if search:
+        disambig = set(find_disambig([doc["title"] for doc in search]))
+        search = [doc for doc in search if doc["title"] not in disambig]
+        for doc in search:
+            # Strip the highlight span in both quoting styles so the
+            # result agrees with tidy_snippet(), which handles the
+            # double-quoted form.
+            without_markup = (
+                doc["snippet"]
+                .replace("<span class='searchmatch'>", "")
+                .replace('<span class="searchmatch">', "")
+                .replace("</span>", "")
+                .replace("  ", " ")
+            )
+            doc["match"] = match_type(q, without_markup)
+            doc["snippet_without_markup"] = without_markup
+    return {
+        "totalhits": totalhits,
+        "results": search,
+        "longer": longer,
+    }
+
+
+def get_case_from_content(title: str) -> str | None:
+    """Work out how the title is capitalised inside the article text.
+
+    A title that equals its own lower-cased form and occurs verbatim in
+    the content is returned unchanged. Otherwise the content is searched
+    case-insensitively for the title (underscores as spaces) wrapped in
+    ''' bold markup, and the matching slice of the content is returned.
+    Returns None when neither is found.
+    """
+    content, _ = get_content_and_timestamp(title)
+    if title in content and title.lower() == title:
+        return title
+
+    bold = "'''"
+    needle = bold + title.replace("_", " ").lower() + bold
+    pos = content.lower().find(needle)
+    if pos == -1:
+        return None  # article doesn't contain the title
+
+    begin = pos + len(bold)
+    return content[begin : begin + len(title)]