#!/usr/bin/env python3
"""Report how often sampled external-ID values on Wikidata contain a space.

The script asks the Wikidata Query Service for external-ID properties that
have a P1630 statement (bound as ?formatterUrl in the query), keeps the first
50 of them, and for each one samples up to 10000 values through the
``bd:sample`` service, counting the values that contain a space.  It then
prints the properties whose query failed, the properties with at least one
space (ordered by ratio, then by sample size), and a one-line summary.
"""

from SPARQLWrapper import SPARQLWrapper, JSON

endpoint_url = 'https://query.wikidata.org/sparql'
user_agent = 'T271126 analysis (lucas.werkmeister@wikimedia.de)'

# One shared client; every query below goes through it and is decoded as JSON.
sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
sparql.setReturnFormat(JSON)


def get_external_id_property_ids():
    """Return the IDs of external-ID properties that have a P1630 statement.

    Each returned item is the ``?property`` binding value with the first
    ``len('http://www.wikidata.org/entity/')`` characters cut off.
    """
    entity_prefix = 'http://www.wikidata.org/entity/'
    query = (
        ' SELECT ?property WHERE {'
        ' ?property wikibase:propertyType wikibase:ExternalId.'
        ' FILTER EXISTS { ?property wdt:P1630 ?formatterUrl. }'
        ' } '
    )
    sparql.setQuery(query)
    response = sparql.query().convert()
    bindings = response['results']['bindings']
    skip = len(entity_prefix)
    return [binding['property']['value'][skip:] for binding in bindings]


def get_counts(property_id):
    """Return ``(with_space, total)`` for a sample of *property_id*'s values.

    ``total`` is the number of sampled statements (the sample limit in the
    query is 10000) and ``with_space`` is how many of those values contain a
    space character.  Both are parsed from the first result row as ``int``.
    """
    query_template = (
        ' SELECT'
        ' (SUM(IF(COALESCE(CONTAINS(?id, " "), false), 1, 0)) AS ?withSpace)'
        ' (COUNT(*) AS ?total)'
        ' WHERE {'
        ' SERVICE bd:sample {'
        ' ?subject wdt:%s ?id.'
        ' bd:serviceParam bd:sample.limit 10000'
        ' }'
        ' } '
    )
    sparql.setQuery(query_template % property_id)
    row = sparql.query().convert()['results']['bindings'][0]
    spaced = int(row['withSpace']['value'])
    sampled = int(row['total']['value'])
    return spaced, sampled


def _ratio_then_total(item):
    """Sort key for ``(property_id, (with_space, total))`` report entries."""
    _, (spaced, sampled) = item
    return spaced / sampled, sampled


properties_without_space = 0
properties_with_space = {}
properties_with_error = {}

external_property_ids = get_external_id_property_ids()[:50]

# The progress bar is optional: without the `progress` package the plain list
# is iterated instead.
try:
    from progress.bar import IncrementalBar
    property_ids = IncrementalBar(
        'Running',
        suffix='%(index)d/%(max)d, %(eta_td)s remaining',
    ).iter(external_property_ids)
except ImportError:
    property_ids = external_property_ids

for property_id in property_ids:
    try:
        with_space, total = get_counts(property_id)
    except Exception as error:
        # Best effort: remember the failure and carry on with the next one.
        properties_with_error[property_id] = error
        continue
    if not with_space:
        properties_without_space += 1
    else:
        properties_with_space[property_id] = (with_space, total)

if properties_with_error:
    print(f'Errors encountered with the following {len(properties_with_error)} properties:')
    for property_id, error in properties_with_error.items():
        print(property_id, error, sep='\n')

if properties_with_space:
    print(f'Spaces found in the following {len(properties_with_space)} properties:')
    ranked = sorted(properties_with_space.items(), key=_ratio_then_total)
    for property_id, (with_space, total) in ranked:
        ratio = with_space / total
        print(f'{property_id:>5}: {ratio * 100:6.2f}% ({with_space:5}/{total:5})')

print(
    f'No spaces found in {properties_without_space} out of '
    f'{properties_without_space + len(properties_with_space) + len(properties_with_error)} IDs.'
)