Request 50 items at a time from wikibase

This commit is contained in:
Edward Betts 2019-10-14 10:45:48 +01:00
parent a5207834e1
commit 5941f28ebd
2 changed files with 51 additions and 35 deletions

View file

@@ -1,5 +1,5 @@
from .model import HumanItem from .model import HumanItem
from . import mediawiki, utils, wikibase from . import mediawiki, wikibase
import re import re
re_four_digits = re.compile(r'\b\d{4}\b') re_four_digits = re.compile(r'\b\d{4}\b')
@@ -38,20 +38,19 @@ def from_name(name):
qids = list(lookup.keys()) qids = list(lookup.keys())
found = [] found = []
for cur in utils.chunk(qids, 50): for entity in mediawiki.get_entities_with_cache(qids, props='labels|descriptions'):
for entity in mediawiki.get_entities_with_cache(cur, props='labels|descriptions'): qid = entity['id']
qid = entity['id'] item = lookup[qid]
item = lookup[qid] i = {
i = { 'qid': entity['id'],
'qid': entity['id'], 'year_of_birth': item.year_of_birth,
'year_of_birth': item.year_of_birth, 'year_of_death': item.year_of_death,
'year_of_death': item.year_of_death, }
} label = wikibase.get_entity_label(entity)
label = wikibase.get_entity_label(entity) if label:
if label: i['label'] = label
i['label'] = label if 'en' in entity['descriptions']:
if 'en' in entity['descriptions']: i['description'] = entity['descriptions']['en']['value']
i['description'] = entity['descriptions']['en']['value'] found.append(i)
found.append(i)
found.sort(key=lambda i: i.get('label', '')) found.sort(key=lambda i: i.get('label', ''))
return found return found

View file

@@ -3,8 +3,10 @@ import os
import json import json
import hashlib import hashlib
from .category import Category from .category import Category
from . import utils
wikidata_url = 'https://www.wikidata.org/w/api.php' wikidata_url = 'https://www.wikidata.org/w/api.php'
page_size = 50
hosts = { hosts = {
'commons': 'commons.wikimedia.org', 'commons': 'commons.wikimedia.org',
@@ -32,19 +34,7 @@ def get_entity(qid):
if 'missing' not in entity: if 'missing' not in entity:
return entity return entity
def get_entities(ids, **params): def wbgetentities(ids, **params):
if not ids:
return []
params = {
'action': 'wbgetentities',
'ids': '|'.join(ids),
**params,
}
r = api_call(params)
json_data = r.json()
return list(json_data['entities'].values())
def get_entities_dict(ids, **params):
if not ids: if not ids:
return [] return []
params = { params = {
@@ -54,6 +44,18 @@ def get_entities_dict(ids, **params):
} }
return api_call(params).json()['entities'] return api_call(params).json()['entities']
def get_entities(ids, **params):
    """Fetch Wikibase entities for *ids*, paging requests in page_size batches.

    Extra keyword arguments are forwarded to the wbgetentities API call.
    Returns a flat list of entity dicts for all requested ids.
    """
    results = []
    for batch in utils.chunk(ids, page_size):
        results.extend(wbgetentities(batch, **params).values())
    return results
def get_entities_dict(ids, **params):
    """Fetch Wikibase entities for *ids*, paged in page_size batches.

    Extra keyword arguments are forwarded to the wbgetentities API call.
    Returns a single dict mapping entity id -> entity.
    """
    merged = {}
    for batch in utils.chunk(ids, page_size):
        merged.update(wbgetentities(batch, **params))
    return merged
def get_entity_with_cache(qid, refresh=False): def get_entity_with_cache(qid, refresh=False):
filename = f'cache/{qid}.json' filename = f'cache/{qid}.json'
if not refresh and os.path.exists(filename): if not refresh and os.path.exists(filename):
@@ -69,20 +71,35 @@ def get_entities_with_cache(ids, **params):
filename = f'cache/entities_{md5}.json' filename = f'cache/entities_{md5}.json'
if os.path.exists(filename): if os.path.exists(filename):
entity = json.load(open(filename)) entity_list = json.load(open(filename))
else: else:
entity = get_entities(ids, **params) entity_list = get_entities(ids, **params)
json.dump(entity, open(filename, 'w'), indent=2) json.dump(entity_list, open(filename, 'w'), indent=2)
return entity return entity_list
def get_entities_dict_with_cache(all_ids, **params):
    """Like get_entities_dict, but caches each page of results on disk.

    Each chunk of page_size ids is cached in its own JSON file under
    cache/, keyed by the MD5 of the space-joined ids, so a repeated
    request for the same chunk is served from disk without an API call.
    Extra keyword arguments are forwarded to wbgetentities.
    Returns a single dict mapping entity id -> entity.
    """
    entities = {}
    for ids in utils.chunk(all_ids, page_size):
        md5 = hashlib.md5(' '.join(ids).encode('utf-8')).hexdigest()
        filename = f'cache/entities_dict_{md5}.json'
        if os.path.exists(filename):
            # Cache hit: use a context manager so the handle is closed
            # promptly (the original open() leaked the file object).
            with open(filename) as f:
                entities.update(json.load(f))
            continue
        cur = wbgetentities(ids, **params)
        with open(filename, 'w') as f:
            json.dump(cur, f, indent=2)
        entities.update(cur)
    return entities
def mediawiki_query(titles, params, site): def mediawiki_query(titles, params, site):
if not titles: if not titles:
return [] return []
# avoid error: Too many values supplied for parameter "titles". The limit is 50. # avoid error: Too many values supplied for parameter "titles". The limit is 50.
if len(titles) > 50: # FIXME: switch to utils.chunk
titles = titles[:50] if len(titles) > page_size:
titles = titles[:page_size]
base = { base = {
'format': 'json', 'format': 'json',
'formatversion': 2, 'formatversion': 2,