Request 50 items at a time from wikibase

This commit is contained in:
Edward Betts 2019-10-14 10:45:48 +01:00
parent a5207834e1
commit 5941f28ebd
2 changed files with 51 additions and 35 deletions

View file

@@ -1,5 +1,5 @@
from .model import HumanItem from .model import HumanItem
from . import mediawiki, utils, wikibase from . import mediawiki, wikibase
import re import re
re_four_digits = re.compile(r'\b\d{4}\b') re_four_digits = re.compile(r'\b\d{4}\b')
@@ -38,20 +38,19 @@ def from_name(name):
qids = list(lookup.keys()) qids = list(lookup.keys())
found = [] found = []
for cur in utils.chunk(qids, 50): for entity in mediawiki.get_entities_with_cache(qids, props='labels|descriptions'):
for entity in mediawiki.get_entities_with_cache(cur, props='labels|descriptions'): qid = entity['id']
qid = entity['id'] item = lookup[qid]
item = lookup[qid] i = {
i = { 'qid': entity['id'],
'qid': entity['id'], 'year_of_birth': item.year_of_birth,
'year_of_birth': item.year_of_birth, 'year_of_death': item.year_of_death,
'year_of_death': item.year_of_death, }
} label = wikibase.get_entity_label(entity)
label = wikibase.get_entity_label(entity) if label:
if label: i['label'] = label
i['label'] = label if 'en' in entity['descriptions']:
if 'en' in entity['descriptions']: i['description'] = entity['descriptions']['en']['value']
i['description'] = entity['descriptions']['en']['value'] found.append(i)
found.append(i)
found.sort(key=lambda i: i.get('label', '')) found.sort(key=lambda i: i.get('label', ''))
return found return found

View file

@@ -3,8 +3,10 @@ import os
import json import json
import hashlib import hashlib
from .category import Category from .category import Category
from . import utils
wikidata_url = 'https://www.wikidata.org/w/api.php' wikidata_url = 'https://www.wikidata.org/w/api.php'
page_size = 50
hosts = { hosts = {
'commons': 'commons.wikimedia.org', 'commons': 'commons.wikimedia.org',
@@ -32,19 +34,7 @@ def get_entity(qid):
if 'missing' not in entity: if 'missing' not in entity:
return entity return entity
def get_entities(ids, **params): def wbgetentities(ids, **params):
if not ids:
return []
params = {
'action': 'wbgetentities',
'ids': '|'.join(ids),
**params,
}
r = api_call(params)
json_data = r.json()
return list(json_data['entities'].values())
def get_entities_dict(ids, **params):
if not ids: if not ids:
return [] return []
params = { params = {
@@ -54,6 +44,18 @@ def get_entities_dict(ids, **params):
} }
return api_call(params).json()['entities'] return api_call(params).json()['entities']
def get_entities(ids, **params):
    """Fetch Wikibase entities for *ids*, paging requests in page_size batches.

    Extra keyword arguments are forwarded to the wbgetentities API call.
    Returns a flat list of entity dicts for all requested ids.
    """
    results = []
    for batch in utils.chunk(ids, page_size):
        results.extend(wbgetentities(batch, **params).values())
    return results
def get_entities_dict(ids, **params):
    """Fetch Wikibase entities for *ids*, paged in page_size batches.

    Extra keyword arguments are forwarded to the wbgetentities API call.
    Returns a single dict mapping entity id -> entity.
    """
    merged = {}
    for batch in utils.chunk(ids, page_size):
        merged.update(wbgetentities(batch, **params))
    return merged
def get_entity_with_cache(qid, refresh=False): def get_entity_with_cache(qid, refresh=False):
filename = f'cache/{qid}.json' filename = f'cache/{qid}.json'
if not refresh and os.path.exists(filename): if not refresh and os.path.exists(filename):
@@ -69,20 +71,35 @@ def get_entities_with_cache(ids, **params):
filename = f'cache/entities_{md5}.json' filename = f'cache/entities_{md5}.json'
if os.path.exists(filename): if os.path.exists(filename):
entity = json.load(open(filename)) entity_list = json.load(open(filename))
else: else:
entity = get_entities(ids, **params) entity_list = get_entities(ids, **params)
json.dump(entity, open(filename, 'w'), indent=2) json.dump(entity_list, open(filename, 'w'), indent=2)
return entity return entity_list
def get_entities_dict_with_cache(all_ids, **params):
    """Like get_entities_dict, but caches each page of results on disk.

    Each chunk of page_size ids is cached in its own JSON file under
    cache/, keyed by the MD5 of the space-joined ids, so a repeated
    request for the same chunk is served from disk without an API call.
    Extra keyword arguments are forwarded to wbgetentities.
    Returns a single dict mapping entity id -> entity.
    """
    entities = {}
    for ids in utils.chunk(all_ids, page_size):
        md5 = hashlib.md5(' '.join(ids).encode('utf-8')).hexdigest()
        filename = f'cache/entities_dict_{md5}.json'
        if os.path.exists(filename):
            # Cache hit: use a context manager so the handle is closed
            # promptly (the original open() leaked the file object).
            with open(filename) as f:
                entities.update(json.load(f))
            continue
        cur = wbgetentities(ids, **params)
        with open(filename, 'w') as f:
            json.dump(cur, f, indent=2)
        entities.update(cur)
    return entities
def mediawiki_query(titles, params, site): def mediawiki_query(titles, params, site):
if not titles: if not titles:
return [] return []
# avoid error: Too many values supplied for parameter "titles". The limit is 50. # avoid error: Too many values supplied for parameter "titles". The limit is 50.
if len(titles) > 50: # FIXME: switch to utils.chunk
titles = titles[:50] if len(titles) > page_size:
titles = titles[:page_size]
base = { base = {
'format': 'json', 'format': 'json',
'formatversion': 2, 'formatversion': 2,