From 5941f28ebd584bfa2ffdeac8c174b4aae6ed0d34 Mon Sep 17 00:00:00 2001
From: Edward Betts <edward@4angle.com>
Date: Mon, 14 Oct 2019 10:45:48 +0100
Subject: [PATCH] Request 50 items at a time from wikibase

---
 depicts/human.py     | 31 ++++++++++++-------------
 depicts/mediawiki.py | 55 +++++++++++++++++++++++++++++---------------
 2 files changed, 51 insertions(+), 35 deletions(-)

diff --git a/depicts/human.py b/depicts/human.py
index 5826bc6..60544eb 100644
--- a/depicts/human.py
+++ b/depicts/human.py
@@ -1,5 +1,5 @@
 from .model import HumanItem
-from . import mediawiki, utils, wikibase
+from . import mediawiki, wikibase
 import re
 
 re_four_digits = re.compile(r'\b\d{4}\b')
@@ -38,20 +38,19 @@ def from_name(name):
     qids = list(lookup.keys())
 
     found = []
-    for cur in utils.chunk(qids, 50):
-        for entity in mediawiki.get_entities_with_cache(cur, props='labels|descriptions'):
-            qid = entity['id']
-            item = lookup[qid]
-            i = {
-                'qid': entity['id'],
-                'year_of_birth': item.year_of_birth,
-                'year_of_death': item.year_of_death,
-            }
-            label = wikibase.get_entity_label(entity)
-            if label:
-                i['label'] = label
-            if 'en' in entity['descriptions']:
-                i['description'] = entity['descriptions']['en']['value']
-            found.append(i)
+    for entity in mediawiki.get_entities_with_cache(qids, props='labels|descriptions'):
+        qid = entity['id']
+        item = lookup[qid]
+        i = {
+            'qid': entity['id'],
+            'year_of_birth': item.year_of_birth,
+            'year_of_death': item.year_of_death,
+        }
+        label = wikibase.get_entity_label(entity)
+        if label:
+            i['label'] = label
+        if 'en' in entity['descriptions']:
+            i['description'] = entity['descriptions']['en']['value']
+        found.append(i)
     found.sort(key=lambda i: i.get('label', ''))
     return found
diff --git a/depicts/mediawiki.py b/depicts/mediawiki.py
index ada56f3..588ea52 100644
--- a/depicts/mediawiki.py
+++ b/depicts/mediawiki.py
@@ -3,8 +3,10 @@ import os
 import json
 import hashlib
 from .category import Category
+from . import utils
 
 wikidata_url = 'https://www.wikidata.org/w/api.php'
+page_size = 50  # batch size for API requests (the API's "titles" limit is 50)
 
 hosts = {
     'commons': 'commons.wikimedia.org',
@@ -32,19 +34,7 @@ def get_entity(qid):
     if 'missing' not in entity:
         return entity
 
-def get_entities(ids, **params):
-    if not ids:
-        return []
-    params = {
-        'action': 'wbgetentities',
-        'ids': '|'.join(ids),
-        **params,
-    }
-    r = api_call(params)
-    json_data = r.json()
-    return list(json_data['entities'].values())
-
-def get_entities_dict(ids, **params):
+def wbgetentities(ids, **params):
     if not ids:
         return []
     params = {
@@ -54,6 +44,18 @@ def get_entities_dict(ids, **params):
     }
     return api_call(params).json()['entities']
 
+def get_entities(ids, **params):
+    # Query in page_size batches and flatten each batch's entities into a list.
+    return [entity
+            for batch in utils.chunk(ids, page_size)
+            for entity in wbgetentities(batch, **params).values()]
+
+def get_entities_dict(ids, **params):  # one merged mapping for all batches
+    merged = {}
+    for batch in utils.chunk(ids, page_size):
+        merged.update(wbgetentities(batch, **params))  # later batches win
+    return merged
+
 def get_entity_with_cache(qid, refresh=False):
     filename = f'cache/{qid}.json'
     if not refresh and os.path.exists(filename):
@@ -69,20 +71,35 @@ def get_entities_with_cache(ids, **params):
 
     filename = f'cache/entities_{md5}.json'
     if os.path.exists(filename):
-        entity = json.load(open(filename))
+        entity_list = json.load(open(filename))
     else:
-        entity = get_entities(ids, **params)
-        json.dump(entity, open(filename, 'w'), indent=2)
+        entity_list = get_entities(ids, **params)
+        json.dump(entity_list, open(filename, 'w'), indent=2)
 
-    return entity
+    return entity_list
+
+def get_entities_dict_with_cache(all_ids, **params):
+    # Fetch entities in page_size batches, caching each batch as a JSON file.
+    entities = {}
+    for ids in utils.chunk(all_ids, page_size):
+        md5 = hashlib.md5(' '.join(ids).encode('utf-8')).hexdigest()
+        filename = f'cache/entities_dict_{md5}.json'  # NOTE: key ignores params
+        if not os.path.exists(filename):
+            cur = wbgetentities(ids, **params)  # fetch first: no file on failure
+            with open(filename, 'w') as out:
+                json.dump(cur, out, indent=2)
+        with open(filename) as cached:  # hit or miss, return what the cache holds
+            entities.update(json.load(cached))
+    return entities
 
 def mediawiki_query(titles, params, site):
     if not titles:
         return []
 
     # avoid error: Too many values supplied for parameter "titles". The limit is 50.
-    if len(titles) > 50:
-        titles = titles[:50]
+    # FIXME: switch to utils.chunk
+    if len(titles) > page_size:
+        titles = titles[:page_size]
     base = {
         'format': 'json',
         'formatversion': 2,