From 43a3cd566c9c37b5b56a7cd919992ee681723eda Mon Sep 17 00:00:00 2001
From: Edward Betts <edward@4angle.com>
Date: Mon, 16 Sep 2019 09:07:09 +0100
Subject: [PATCH] Add missing module

---
 depicts/mediawiki.py | 124 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 124 insertions(+)
 create mode 100644 depicts/mediawiki.py

diff --git a/depicts/mediawiki.py b/depicts/mediawiki.py
new file mode 100644
index 0000000..a8152b6
--- /dev/null
+++ b/depicts/mediawiki.py
@@ -0,0 +1,124 @@
+import requests
+import os
+import json
+from .category import Category
+
+wikidata_url = 'https://www.wikidata.org/w/api.php'  # api_call's default endpoint
+
+hosts = {  # site key -> hostname, used to build https://<host>/w/api.php
+    'commons': 'commons.wikimedia.org',
+    'enwiki': 'en.wikipedia.org',
+    'wikidata': 'www.wikidata.org',
+}
+
+def api_call(params, api_url=wikidata_url):
+    """Send a GET request to a MediaWiki API and return the response.
+
+    'format' and 'formatversion' default to JSON output, version 2; entries
+    in params are merged afterwards and so take precedence. The requests
+    response object is returned as-is, without any status check.
+    """
+    defaults = {'format': 'json', 'formatversion': 2}
+    return requests.get(api_url, params={**defaults, **params})
+
+def get_entity(qid):
+    """Return the Wikidata entity dict for qid, or None if absent/'missing'."""
+    reply = api_call({'action': 'wbgetentities', 'ids': qid}).json()
+    try:
+        found = list(reply['entities'].values())[0]
+    except KeyError:
+        # the reply carried no 'entities' key
+        return None
+    return None if 'missing' in found else found
+
+def get_entities(ids, **params):
+    """Fetch several Wikidata entities in a single wbgetentities request.
+
+    Extra keyword arguments are passed through as API parameters and may
+    override 'action' and 'ids'. Returns a list of entity dicts; an empty
+    ids gives [] without any request being made.
+    """
+    if not ids:
+        return []
+    query = {'action': 'wbgetentities', 'ids': '|'.join(ids), **params}
+    return list(api_call(query).json()['entities'].values())
+
+def get_entity_with_cache(qid):  # entity for qid, cached in cache/{qid}.json
+    filename = f'cache/{qid}.json'
+    if os.path.exists(filename):
+        with open(filename) as f:  # close the handle instead of leaking it
+            return json.load(f)
+    entity = get_entity(qid)  # cache miss: fetch, then store (None included)
+    with open(filename, 'w') as f:  # closing also flushes the written JSON
+        json.dump(entity, f, indent=2)
+    return entity
+
+def mediawiki_query(titles, params, site):
+    """Run an action=query request on site and return the reply's pages.
+
+    site is a key of hosts; params entries override the base parameters.
+    Empty titles returns []. Only the first 50 titles are sent (the API
+    limit named below); the rest are silently dropped.
+    Raises AssertionError on a non-200 status or an unexpected content
+    type, and KeyError (after printing the reply) if 'query' is absent.
+    """
+    if not titles:
+        return []
+    # avoid error: Too many values supplied for parameter "titles". The limit is 50.
+    if len(titles) > 50:
+        titles = titles[:50]
+    p = {
+        'format': 'json', 'formatversion': 2, 'action': 'query',
+        'continue': '', 'titles': '|'.join(titles), **params,
+    }
+    r = requests.get(f'https://{hosts[site]}/w/api.php', params=p)
+    bad_status = r.status_code != 200
+    if bad_status:
+        print('status code: {r.status_code}'.format(r=r))
+    bad_type = r.headers['content-type'] != 'application/json; charset=utf-8'
+    if bad_type:
+        print('content-type: {r.headers[content-type]}'.format(r=r))
+    if bad_status or bad_type:
+        # raised explicitly: a bare assert is stripped when run with -O
+        raise AssertionError(f'unexpected reply from {r.url}')
+    json_reply = r.json()
+    if 'query' not in json_reply:
+        print(r.url)
+        print(r.text)
+    return json_reply['query']['pages']
+
+def get_content_and_categories(title, site):
+    """Return (first revision's content, non-hidden categories) for title.
+
+    Raises AssertionError unless the query yields exactly one page.
+    """
+    params = {'prop': 'revisions|categories', 'clshow': '!hidden',
+              'cllimit': 'max', 'rvprop': 'content'}
+    pages = mediawiki_query([title], params, site)
+    if len(pages) != 1:  # explicit raise: a bare assert vanishes under -O
+        raise AssertionError(f'expected 1 page, got {len(pages)}')
+    page = pages[0]
+    return (page['revisions'][0]['content'], page.get('categories', []))
+
+def host_from_site(site):
+    return hosts[site]  # hostname for this hosts key; KeyError if unknown
+
+def process_cats(cats, site):  # wrap each entry's 'title' in a Category for site
+    return [Category(cat['title'], site) for cat in cats]
+
+def get_categories(titles, site):
+    """Return (title, categories) pairs for the given page titles on site.
+
+    Each categories value is the list of Category objects built from the
+    page's non-hidden categories. Pages with no 'categories' entry, or an
+    empty one, are left out of the result.
+    """
+    params = {
+        'prop': 'categories',
+        'clshow': '!hidden',
+        'cllimit': 'max',
+    }
+    pages = mediawiki_query(titles, params, site)
+    listed = ((p, process_cats(p['categories'], site))
+              for p in pages if 'categories' in p)
+    return [(p['title'], cats) for p, cats in listed if cats]