From 4f6f0ed0868729ce4d2aa4f1d8813e9197518d8e Mon Sep 17 00:00:00 2001
From: Edward Betts
Date: Sun, 14 Nov 2021 08:01:19 +0000
Subject: [PATCH] Remove unused code

---
 matcher/wikidata.py | 91 +++++++++++++++++++++++++++++++++++++++++++++
 tests/test_utils.py |  7 +-----
 2 files changed, 92 insertions(+), 6 deletions(-)

diff --git a/matcher/wikidata.py b/matcher/wikidata.py
index 9daea02..e81308c 100644
--- a/matcher/wikidata.py
+++ b/matcher/wikidata.py
@@ -1,3 +1,6 @@
+from collections import defaultdict
+import re
+
 hq_pid = "P159"
 coords_pid = "P625"
 
@@ -50,3 +53,91 @@ def get_entity_coords(claims):
         hq_pid: read_hq_coords(claims),
     }
     return {pid: values for pid, values in ret.items() if values}
+
+def names_from_entity(entity, skip_lang=None):
+    """Collect candidate names for a Wikidata entity.
+
+    Names come from labels, sitelink titles, aliases (skipping languages
+    with more than three) and the P373, P1448, P1705 and P18 claims.
+
+    :param entity: entity dict; ``labels``, ``sitelinks``, ``aliases``
+        and ``claims`` are each optional.
+    :param skip_lang: optional collection of language codes to ignore.
+    :return: ``defaultdict(list)`` mapping each name to a list of
+        ``(source, detail)`` tuples such as ``('label', 'en')``.
+    """
+    if skip_lang is None:
+        skip_lang = set()
+
+    ret = defaultdict(list)
+    cat_start = 'Category:'
+
+    for k, v in entity.get('labels', {}).items():
+        if k in skip_lang:
+            continue
+        ret[v['value']].append(('label', k))
+
+    for k, v in entity.get('sitelinks', {}).items():
+        # NOTE(review): if sitelink keys are site ids such as 'enwiki',
+        # this never matches a bare language code - confirm intent.
+        if k + 'wiki' in skip_lang:
+            continue
+        title = v['title']
+        if title.startswith(cat_start):
+            title = title[len(cat_start):]
+        if not title:
+            continue  # nothing left to use as a name
+
+        # Reuse an existing lower-case-initial spelling when there is one.
+        first_letter = title[0]
+        if first_letter.isupper():
+            lc_first_title = first_letter.lower() + title[1:]
+            if lc_first_title in ret:
+                title = lc_first_title
+
+        ret[title].append(('sitelink', k))
+
+    for lang, value_list in entity.get('aliases', {}).items():
+        if lang in skip_lang or len(value_list) > 3:
+            continue
+        for name in value_list:
+            ret[name['value']].append(('alias', lang))
+
+    commonscats = entity.get('claims', {}).get('P373', [])
+    for i in commonscats:
+        if 'datavalue' not in i['mainsnak']:
+            continue
+        value = i['mainsnak']['datavalue']['value']
+        ret[value].append(('commonscat', None))
+
+    officialname = entity.get('claims', {}).get('P1448', [])
+    for i in officialname:
+        if 'datavalue' not in i['mainsnak']:
+            continue
+        value = i['mainsnak']['datavalue']['value']
+        ret[value['text']].append(('officialname', value['language']))
+
+    nativelabel = entity.get('claims', {}).get('P1705', [])
+    for i in nativelabel:
+        if 'datavalue' not in i['mainsnak']:
+            continue
+        value = i['mainsnak']['datavalue']['value']
+        ret[value['text']].append(('nativelabel', value['language']))
+
+    image = entity.get('claims', {}).get('P18', [])
+    for i in image:
+        if 'datavalue' not in i['mainsnak']:
+            continue
+        value = i['mainsnak']['datavalue']['value']
+        # Drop the file extension, then any trailing Geograph or number tag.
+        m = re.search(r'\.[a-z]{3,4}$', value)
+        if m:
+            value = value[:m.start()]
+        for pattern in r' - geograph\.org\.uk - \d+$', r'[, -]*0\d{2,}$':
+            m = re.search(pattern, value)
+            if m:
+                value = value[:m.start()]
+                break
+        ret[value].append(('image', None))
+
+    return ret
diff --git a/tests/test_utils.py b/tests/test_utils.py
index cddcbe4..e091bc1 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,16 +1,11 @@
 from matcher import utils
 
-def test_pluralize_label():
-    v = {"language": "en", "value": "building"}
-    assert utils.pluralize_label(v) == "buildings"
-
-    v = {"language": "en", "value": "mine"}
-    assert utils.pluralize_label(v) == "mines"
 
 def test_format_wikibase_time_year():
     v = {"time": "+1950-00-00T00:00:00Z", "precision": 9}
     assert utils.format_wikibase_time(v) == "1950"
 
+
 def test_format_wikibase_time_century():
     v = {"time": "+0800-00-00T00:00:00Z", "precision": 7}
     assert utils.format_wikibase_time(v) == "8th century"