From 5941f28ebd584bfa2ffdeac8c174b4aae6ed0d34 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Mon, 14 Oct 2019 10:45:48 +0100 Subject: [PATCH] Request 50 items at a time from wikibase --- depicts/human.py | 31 ++++++++++++------------- depicts/mediawiki.py | 55 +++++++++++++++++++++++++++++--------------- 2 files changed, 51 insertions(+), 35 deletions(-) diff --git a/depicts/human.py b/depicts/human.py index 5826bc6..60544eb 100644 --- a/depicts/human.py +++ b/depicts/human.py @@ -1,5 +1,5 @@ from .model import HumanItem -from . import mediawiki, utils, wikibase +from . import mediawiki, wikibase import re re_four_digits = re.compile(r'\b\d{4}\b') @@ -38,20 +38,19 @@ def from_name(name): qids = list(lookup.keys()) found = [] - for cur in utils.chunk(qids, 50): - for entity in mediawiki.get_entities_with_cache(cur, props='labels|descriptions'): - qid = entity['id'] - item = lookup[qid] - i = { - 'qid': entity['id'], - 'year_of_birth': item.year_of_birth, - 'year_of_death': item.year_of_death, - } - label = wikibase.get_entity_label(entity) - if label: - i['label'] = label - if 'en' in entity['descriptions']: - i['description'] = entity['descriptions']['en']['value'] - found.append(i) + for entity in mediawiki.get_entities_with_cache(qids, props='labels|descriptions'): + qid = entity['id'] + item = lookup[qid] + i = { + 'qid': entity['id'], + 'year_of_birth': item.year_of_birth, + 'year_of_death': item.year_of_death, + } + label = wikibase.get_entity_label(entity) + if label: + i['label'] = label + if 'en' in entity['descriptions']: + i['description'] = entity['descriptions']['en']['value'] + found.append(i) found.sort(key=lambda i: i.get('label', '')) return found diff --git a/depicts/mediawiki.py b/depicts/mediawiki.py index ada56f3..588ea52 100644 --- a/depicts/mediawiki.py +++ b/depicts/mediawiki.py @@ -3,8 +3,10 @@ import os import json import hashlib from .category import Category +from . import utils wikidata_url = 'https://www.wikidata.org/w/api.php' +page_size = 50 hosts = { 'commons': 'commons.wikimedia.org', @@ -32,19 +34,7 @@ def get_entity(qid): if 'missing' not in entity: return entity -def get_entities(ids, **params): - if not ids: - return [] - params = { - 'action': 'wbgetentities', - 'ids': '|'.join(ids), - **params, - } - r = api_call(params) - json_data = r.json() - return list(json_data['entities'].values()) - -def get_entities_dict(ids, **params): +def wbgetentities(ids, **params): if not ids: return [] params = { @@ -54,6 +44,18 @@ def get_entities_dict(ids, **params): } return api_call(params).json()['entities'] +def get_entities(ids, **params): + entity_list = [] + for cur in utils.chunk(ids, page_size): + entity_list += wbgetentities(cur, **params).values() + return entity_list + +def get_entities_dict(ids, **params): + entities = {} + for cur in utils.chunk(ids, page_size): + entities.update(wbgetentities(cur, **params)) + return entities + def get_entity_with_cache(qid, refresh=False): filename = f'cache/{qid}.json' if not refresh and os.path.exists(filename): @@ -69,20 +71,35 @@ def get_entities_with_cache(ids, **params): filename = f'cache/entities_{md5}.json' if os.path.exists(filename): - entity = json.load(open(filename)) + entity_list = json.load(open(filename)) else: - entity = get_entities(ids, **params) - json.dump(entity, open(filename, 'w'), indent=2) + entity_list = get_entities(ids, **params) + json.dump(entity_list, open(filename, 'w'), indent=2) - return entity + return entity_list + +def get_entities_dict_with_cache(all_ids, **params): + entities = {} + for ids in utils.chunk(all_ids, page_size): + md5 = hashlib.md5(' '.join(ids).encode('utf-8')).hexdigest() + + filename = f'cache/entities_dict_{md5}.json' + if os.path.exists(filename): + entities.update(json.load(open(filename))) + continue + cur = wbgetentities(ids, **params) + json.dump(cur, open(filename, 'w'), indent=2) + entities.update(cur) + return entities def mediawiki_query(titles, params, site): if not titles: return [] # avoid error: Too many values supplied for parameter "titles". The limit is 50. - if len(titles) > 50: - titles = titles[:50] + # FIXME: switch to utils.chunk + if len(titles) > page_size: + titles = titles[:page_size] base = { 'format': 'json', 'formatversion': 2,