#!/usr/bin/env python3
"""Report which external-ID properties have values containing a space.

The script asks the SPARQL endpoint at ``endpoint_url`` for every property
of type ``wikibase:ExternalId``, samples statements of each property, and
prints a summary of the properties whose sampled values contain a space
character, the properties without any, and the properties whose query
failed.
"""

from SPARQLWrapper import SPARQLWrapper, JSON

endpoint_url = 'https://query.wikidata.org/sparql'
user_agent = 'T271126 analysis (lucas.werkmeister@wikimedia.de)'

# A single shared client; each helper below sets its own query on it
# before executing.
sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
sparql.setReturnFormat(JSON)


def get_external_id_property_ids():
    """Return the IDs of all properties whose type is ``wikibase:ExternalId``.

    Each binding's ``property`` value is an entity URI; the
    ``http://www.wikidata.org/entity/`` prefix is cut off so that only the
    trailing ID part remains.

    Returns:
        A list of property ID strings, in the order the endpoint returned them.
    """
    query = '''
SELECT ?property WHERE {
  ?property wikibase:propertyType wikibase:ExternalId.
}
'''
    sparql.setQuery(query)
    results = sparql.query().convert()['results']['bindings']
    return [result['property']['value'][len('http://www.wikidata.org/entity/'):]
            for result in results]


def get_counts(property_id):
    """Count sampled values of *property_id* that contain a space.

    The query goes through the ``bd:sample`` service with a limit of 10000,
    so the counts describe a sample rather than necessarily every statement.

    Args:
        property_id: Property ID to interpolate after ``wdt:`` in the query.

    Returns:
        A ``(with_space, total)`` tuple of ints: the number of sampled
        values containing a space, and the number of sampled values.

    Raises:
        Exception: whatever the SPARQL client raises for a failed query, or
            ``KeyError``/``IndexError``/``ValueError`` if the response lacks
            the expected bindings.
    """
    query = '''
SELECT
  (SUM(IF(COALESCE(CONTAINS(?id, " "), false), 1, 0)) AS ?withSpace)
  (COUNT(*) AS ?total)
WHERE {
  SERVICE bd:sample {
    ?subject wdt:%s ?id.
    bd:serviceParam bd:sample.limit 10000
  }
}
''' % property_id
    sparql.setQuery(query)
    result = sparql.query().convert()['results']['bindings'][0]
    return int(result['withSpace']['value']), int(result['total']['value'])


properties_without_space = 0
properties_with_space = {}   # property ID -> fraction of sampled values with a space
properties_with_error = {}   # property ID -> exception raised while querying it

external_property_ids = get_external_id_property_ids()

# The progress bar is optional: fall back to plain iteration when the
# `progress` package is not installed.
try:
    from progress.bar import IncrementalBar
    property_ids = IncrementalBar(
        'Running',
        suffix='%(index)d/%(max)d, %(eta_td)s remaining',
    ).iter(external_property_ids)
except ImportError:
    property_ids = external_property_ids

for property_id in property_ids:
    try:
        with_space, total = get_counts(property_id)
    except Exception as e:
        # Deliberately broad: one failing property must not abort the whole
        # run. The error is recorded and reported at the end.
        properties_with_error[property_id] = e
    else:
        if with_space:
            properties_with_space[property_id] = with_space / total
        else:
            properties_without_space += 1

if properties_with_error:
    print(f'Errors encountered with the following {len(properties_with_error)} properties:')
    for property_id, e in properties_with_error.items():
        print(property_id)
        print(e)

if properties_with_space:
    print(f'Spaces found in the following {len(properties_with_space)} properties:')
    for property_id, ratio in sorted(properties_with_space.items(), key=lambda item: item[1]):
        # Use an explicit 'g' presentation type: a bare '.3' switches to
        # scientific notation for 100.0 and would print '1e+02%'.
        print(f'{property_id}: {ratio * 100:.3g}%')

print(f'No spaces found in {properties_without_space} out of {properties_without_space + len(properties_with_space) + len(properties_with_error)} IDs.')