import json

import requests


def get_source_search(term):
    """Run a Special:Search on Commons with cirrusDumpQuery=1, which returns
    the Elasticsearch query CirrusSearch would execute instead of results."""
    res = requests.get('https://commons.wikimedia.org/wiki/Special:Search', params={
        'cirrusDumpQuery': 1,
        'fulltext': 1,
        'search': term,
    })
    try:
        return res.json()
    except ValueError:
        # res.json() raises a ValueError subclass when the body isn't JSON;
        # dump the raw response to aid debugging before re-raising.
        print(res.text)
        raise


def augment_search(search):
    """Turn the dumped query into an aggregation-only query: drop highlighting,
    return no hits, and bucket the matching files by their statements."""
    query = search['__main__']['query']
    del query['highlight']
    query['size'] = 0
    query['aggs'] = {
        'depicts': {
            'terms': {
                'field': 'statement_keywords',
            },
        },
    }
    return query


def run_elastic_query(search_query):
    url = 'http://localhost:9200/commonswiki_file/page/_search'
    res = requests.post(url, data=json.dumps(search_query), headers={
        'Content-Type': 'application/json',
    })
    return res.json()


def extract_qids(results):
    """Yield (QID, doc_count) pairs from the terms aggregation. Bucket keys
    look like 'P180=Q1234', so split off the property and keep the item."""
    for bucket in results['aggregations']['depicts']['buckets']:
        q_item = bucket['key'].split('=', 1)[1]
        count = bucket['doc_count']
        yield (q_item, count)


def augment_qids(qids):
    """Resolve QIDs to labels via the Wikidata wbgetentities API, preferring
    the English label and falling back to any available language."""
    titles = [qid for qid, _count in qids]
    res = requests.get('https://www.wikidata.org/w/api.php', params={
        'action': 'wbgetentities',
        'ids': '|'.join(titles),
        'props': 'labels',
        'format': 'json',
        'formatversion': 2,
    })
    out = {}
    for qid, data in res.json()['entities'].items():
        try:
            label = data['labels']['en']
        except KeyError:
            label = next(iter(data['labels'].values()))
        out[qid] = label['value']
    return out


def doit(term):
    source = get_source_search(term)
    final = augment_search(source)
    results = run_elastic_query(final)
    qids = list(extract_qids(results))
    augmented = augment_qids(qids)
    for qid, count in qids:
        yield (qid, augmented[qid], count)


if __name__ == "__main__":
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument('term')
    args = parser.parse_args()
    print('|count|qid|label')
    for qid, label, count in doit(args.term):
        print('|{: 5d}|{:10s}|{}'.format(count, qid, label))
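
# Example run, as a sketch. This assumes an Elasticsearch instance serving the
# commonswiki_file index is reachable on localhost:9200 (e.g. via an SSH tunnel
# to a CirrusSearch host), and that this file is saved as depicts_agg.py (a
# hypothetical name):
#
#   $ python depicts_agg.py 'cat'
#   |count|qid|label
#   ...one row per depicted item, sorted by the aggregation's bucket order...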