Request 50 items at a time from wikibase

parent a5207834e1
commit 5941f28ebd

@@ -1,5 +1,5 @@
 from .model import HumanItem
-from . import mediawiki, utils, wikibase
+from . import mediawiki, wikibase
 import re
 
 re_four_digits = re.compile(r'\b\d{4}\b')
@@ -38,20 +38,19 @@ def from_name(name):
     qids = list(lookup.keys())
 
     found = []
-    for cur in utils.chunk(qids, 50):
-        for entity in mediawiki.get_entities_with_cache(cur, props='labels|descriptions'):
-            qid = entity['id']
-            item = lookup[qid]
-            i = {
-                'qid': entity['id'],
-                'year_of_birth': item.year_of_birth,
-                'year_of_death': item.year_of_death,
-            }
-            label = wikibase.get_entity_label(entity)
-            if label:
-                i['label'] = label
-            if 'en' in entity['descriptions']:
-                i['description'] = entity['descriptions']['en']['value']
-            found.append(i)
+    for entity in mediawiki.get_entities_with_cache(qids, props='labels|descriptions'):
+        qid = entity['id']
+        item = lookup[qid]
+        i = {
+            'qid': entity['id'],
+            'year_of_birth': item.year_of_birth,
+            'year_of_death': item.year_of_death,
+        }
+        label = wikibase.get_entity_label(entity)
+        if label:
+            i['label'] = label
+        if 'en' in entity['descriptions']:
+            i['description'] = entity['descriptions']['en']['value']
+        found.append(i)
     found.sort(key=lambda i: i.get('label', ''))
     return found

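Both the old caller above and the new helpers in the second file below rely on a utils.chunk helper that is not part of this diff. A minimal sketch of a chunking helper with the same call shape, assuming it simply yields consecutive slices of at most `size` items:

def chunk(lst, size):
    # yield consecutive slices of at most `size` items from lst
    for i in range(0, len(lst), size):
        yield lst[i:i + size]

Any implementation that yields batches of at most 50 ids would satisfy the call sites in this commit; whether the real utils.chunk returns lists or a generator is an assumption here.
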
@@ -3,8 +3,10 @@ import os
 import json
 import hashlib
 from .category import Category
+from . import utils
 
 wikidata_url = 'https://www.wikidata.org/w/api.php'
+page_size = 50
 
 hosts = {
     'commons': 'commons.wikimedia.org',
@@ -32,19 +34,7 @@ def get_entity(qid):
     if 'missing' not in entity:
         return entity
 
-def get_entities(ids, **params):
-    if not ids:
-        return []
-    params = {
-        'action': 'wbgetentities',
-        'ids': '|'.join(ids),
-        **params,
-    }
-    r = api_call(params)
-    json_data = r.json()
-    return list(json_data['entities'].values())
-
-def get_entities_dict(ids, **params):
+def wbgetentities(ids, **params):
     if not ids:
         return []
     params = {

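Read together with the next hunk, the renamed wbgetentities keeps the body of the old get_entities_dict: it joins one batch of ids with '|' and returns the raw entities mapping from the wbgetentities API action. The middle of its params dict sits outside the diff context, so the following reconstruction is an assumption based on the deleted get_entities body:

def wbgetentities(ids, **params):
    # fetch one batch (at most page_size ids) and return the raw entities dict
    if not ids:
        return []
    params = {
        'action': 'wbgetentities',
        'ids': '|'.join(ids),
        **params,
    }
    return api_call(params).json()['entities']
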
@@ -54,6 +44,18 @@ def get_entities_dict(ids, **params):
     }
     return api_call(params).json()['entities']
 
+def get_entities(ids, **params):
+    entity_list = []
+    for cur in utils.chunk(ids, page_size):
+        entity_list += wbgetentities(cur, **params).values()
+    return entity_list
+
+def get_entities_dict(ids, **params):
+    entities = {}
+    for cur in utils.chunk(ids, page_size):
+        entities.update(wbgetentities(cur, **params))
+    return entities
+
 def get_entity_with_cache(qid, refresh=False):
     filename = f'cache/{qid}.json'
     if not refresh and os.path.exists(filename):

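With the chunking now inside the module, callers can pass an id list of any length; the split into batches of page_size (50, the usual per-request limit for wbgetentities) happens internally. A hypothetical usage sketch (the QIDs here are only illustrative):

qids = ['Q42', 'Q1339', 'Q7259']  # any number of ids is fine now
entity_list = get_entities(qids, props='labels|descriptions')  # list of entity dicts
entity_map = get_entities_dict(qids, props='labels')           # dict keyed by QID
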
@@ -69,20 +71,35 @@ def get_entities_with_cache(ids, **params):
 
     filename = f'cache/entities_{md5}.json'
     if os.path.exists(filename):
-        entity = json.load(open(filename))
+        entity_list = json.load(open(filename))
     else:
-        entity = get_entities(ids, **params)
-        json.dump(entity, open(filename, 'w'), indent=2)
+        entity_list = get_entities(ids, **params)
+        json.dump(entity_list, open(filename, 'w'), indent=2)
 
-    return entity
+    return entity_list
 
+def get_entities_dict_with_cache(all_ids, **params):
+    entities = {}
+    for ids in utils.chunk(all_ids, page_size):
+        md5 = hashlib.md5(' '.join(ids).encode('utf-8')).hexdigest()
+
+        filename = f'cache/entities_dict_{md5}.json'
+        if os.path.exists(filename):
+            entities.update(json.load(open(filename)))
+            continue
+        cur = wbgetentities(ids, **params)
+        json.dump(cur, open(filename, 'w'), indent=2)
+        entities.update(cur)
+    return entities
+
 def mediawiki_query(titles, params, site):
     if not titles:
         return []
 
     # avoid error: Too many values supplied for parameter "titles". The limit is 50.
-    if len(titles) > 50:
-        titles = titles[:50]
+    # FIXME: switch to utils.chunk
+    if len(titles) > page_size:
+        titles = titles[:page_size]
     base = {
         'format': 'json',
         'formatversion': 2,

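The new get_entities_dict_with_cache caches each batch in its own file, keyed by an md5 of the ids in that batch, so a later call only hits the network for batches it has not seen before. A hypothetical usage sketch, assuming the standard wbgetentities entity shape for labels:

entities = get_entities_dict_with_cache(qids, props='labels')
for qid, entity in entities.items():
    # English label, if the entity has one
    label = entity.get('labels', {}).get('en', {}).get('value')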