Add code for downloading Wikidata items
This commit is contained in:
		
							parent
							
								
									61ecfdef8b
								
							
						
					
					
						commit
						8fea2a8aa7
					
				
							
								
								
									
										52
									
								
								matcher/wikidata.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										52
									
								
								matcher/wikidata.py
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,52 @@
 | 
			
		|||
# Wikidata property ID whose claims are scanned for coordinate qualifiers
# (presumably "headquarters location" -- confirm against Wikidata).
hq_pid = "P159"
 | 
			
		||||
# Wikidata property ID used both for direct coordinate statements and as the
# qualifier key looked up on hq_pid claims (presumably "coordinate location"
# -- confirm against Wikidata).
coords_pid = "P625"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def read_coords(snak):
    """Pull latitude/longitude out of a coordinate snak.

    Args:
        snak: Mapping expected to hold ``snak["datavalue"]["value"]`` with
            "globe", "latitude" and "longitude" entries.

    Returns:
        A dict with "latitude" and "longitude" keys, or None when the snak
        carries no datavalue/value or when the globe is not Q2.
    """
    try:
        value = snak["datavalue"]["value"]
    except KeyError:
        # No value recorded on this snak.
        return None

    # The globe is given as an entity URI; only its final path segment (the
    # item ID) is compared.
    globe_id = value["globe"].rpartition("/")[2]
    if globe_id == "Q2":
        return {"latitude": value["latitude"], "longitude": value["longitude"]}

    return None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def read_hq_coords(claims):
    """Collect coordinates stored as qualifiers on ``hq_pid`` claims.

    Args:
        claims: Mapping of property ID to a list of claim dicts.

    Returns:
        A list of coordinate dicts as produced by ``read_coords``; empty when
        there are no ``hq_pid`` claims or none carries a usable
        ``coords_pid`` qualifier.
    """
    if hq_pid not in claims:
        return []

    # Flatten every coords_pid qualifier snak across all hq_pid claims,
    # skipping claims without qualifiers or without that qualifier.
    qualifier_snaks = (
        snak
        for claim in claims[hq_pid]
        if "qualifiers" in claim and coords_pid in claim["qualifiers"]
        for snak in claim["qualifiers"][coords_pid]
    )
    return [coords for coords in map(read_coords, qualifier_snaks) if coords]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def read_location_statement(claims, pid):
    """Read coordinates from the main snaks of the statements for ``pid``.

    Args:
        claims: Mapping of property ID to a list of statement dicts, each
            with a "mainsnak" entry.
        pid: Property ID whose statements should be read.

    Returns:
        A list of coordinate dicts as produced by ``read_coords``; statements
        for which ``read_coords`` yields nothing are left out.
    """
    statements = claims[pid] if pid in claims else []
    candidates = (read_coords(statement["mainsnak"]) for statement in statements)
    return [coords for coords in candidates if coords]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_entity_coords(claims):
    """Gather all coordinates found in an entity's claims, keyed by property.

    Args:
        claims: The claims mapping of an entity (not the entity itself).

    Returns:
        A dict mapping ``coords_pid`` and/or ``hq_pid`` to a non-empty list of
        coordinate dicts; properties with no coordinates are omitted.
    """
    assert "claims" not in claims  # make sure we weren't passed entity by mistake

    result = {}

    direct = read_location_statement(claims, coords_pid)
    if direct:
        result[coords_pid] = direct

    via_hq = read_hq_coords(claims)
    if via_hq:
        result[hq_pid] = via_hq

    return result
 | 
			
		||||
							
								
								
									
										66
									
								
								matcher/wikidata_api.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										66
									
								
								matcher/wikidata_api.py
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,66 @@
 | 
			
		|||
import requests
 | 
			
		||||
import json
 | 
			
		||||
 | 
			
		||||
wd_api_url = "https://www.wikidata.org/w/api.php"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def api_get(params, timeout=30):
    """Send a GET request to the Wikidata API.

    Args:
        params: Query parameters for the call; they override the defaults
            ("format": "json", "formatversion": 2) on key collision.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Returns:
        The ``requests.Response`` object for the call.

    Raises:
        requests.exceptions.Timeout: If the server does not respond in time.
    """
    base_params = {
        "format": "json",
        "formatversion": 2,
    }

    return requests.get(
        wd_api_url,
        params={**base_params, **params},
        timeout=timeout,
    )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_revision_timestamp(revid):
    """Look up the timestamp of a revision by its revision ID.

    Args:
        revid: Revision ID; must be convertible with ``int()``.

    Returns:
        The "timestamp" value of the first revision of the first page in the
        API response.
    """
    response = api_get(
        {
            "action": "query",
            "prop": "revisions",
            "revids": revid,
            "rvprop": "ids|timestamp",
        }
    )
    pages = response.json()["query"]["pages"]
    revision = pages[0]["revisions"][0]
    # Sanity check that the API answered for the revision we asked about.
    assert revision["revid"] == int(revid)
    return revision["timestamp"]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_recent_changes(**kwargs):
    """Query the recent-changes list for namespace 0, oldest first.

    Args:
        **kwargs: Extra API parameters. Entries with a falsy value are
            ignored; the rest override the defaults below.

    Returns:
        The response object returned by ``api_get``.
    """
    rcprop = (
        "title",
        "ids",
        "comment",
        "parsedcomment",
        "timestamp",
        "redirect",
        "loginfo",
    )

    params = {
        "action": "query",
        "list": "recentchanges",
        "rcnamespace": 0,
        "rclimit": "max",
        "rcdir": "newer",
        "rcprop": "|".join(rcprop),
    }
    # Only truthy keyword values are forwarded to the API.
    params.update((key, value) for key, value in kwargs.items() if value)

    return api_get(params)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_entity(qid):
    """Fetch a single entity via the ``wbgetentities`` action.

    Args:
        qid: Entity ID to request; also the key used to pick the entity out
            of the response.

    Returns:
        The entity dict stored under ``qid`` in the response's "entities".

    Raises:
        KeyError: If the response has no "entities" key (the response is
            printed first) or no entry for ``qid``.
    """
    response = api_get({"action": "wbgetentities", "ids": qid})
    payload = response.json()

    has_entities = "entities" in payload
    if not has_entities:
        # Dump the unexpected response for debugging; the lookup below then
        # fails with KeyError.
        print(json.dumps(payload, indent=2))

    return payload["entities"][qid]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_entities(ids, batch_size=50):
    """Yield ``(qid, entity)`` pairs for the given entity IDs.

    The IDs are requested in batches, because ``wbgetentities`` limits how
    many values one request may carry in ``ids`` (50 for ordinary clients);
    joining everything into a single request breaks for larger inputs.

    Args:
        ids: Iterable of entity ID strings.
        batch_size: Maximum number of IDs per API request. NOTE(review):
            accounts with higher API limits may be able to raise this --
            confirm before changing.

    Yields:
        ``(qid, entity)`` tuples from each response's "entities" mapping.
        Nothing is yielded (and no request is made) when ``ids`` is empty.
        An ID repeated across batches is yielded once per batch.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    id_list = list(ids)  # allow any iterable, including one-shot iterators
    for start in range(0, len(id_list), batch_size):
        batch = id_list[start : start + batch_size]
        r = api_get({"action": "wbgetentities", "ids": "|".join(batch)})
        yield from r.json()["entities"].items()
 | 
			
		||||
		Loading…
	
		Reference in a new issue