diff --git a/integraality/pages_processor.py b/integraality/pages_processor.py index 9a41f04..e44211e 100644 --- a/integraality/pages_processor.py +++ b/integraality/pages_processor.py @@ -1,242 +1,243 @@ #!/usr/bin/python # -*- coding: utf-8 -*- """ Bot to generate statistics """ import os import re from redis import StrictRedis import pywikibot from cache import RedisCache from column import ColumnMaker, ColumnSyntaxException from grouping import ( GroupingConfigurationMaker, UnsupportedGroupingConfigurationException, ) from page_saving import save_to_wiki_or_local from property_statistics import PropertyStatistics from sparql_utils import QueryException, SparqlEngineBuilder REQUIRED_CONFIG_FIELDS = ["selector_sparql", "grouping_property", "properties"] class ProcessingException(Exception): pass class ConfigException(ProcessingException): pass class NoEndTemplateException(ProcessingException): pass class NoStartTemplateException(ProcessingException): pass class PagesProcessor: def __init__(self, url="https://www.wikidata.org/wiki/", cache_client=None): self.site = pywikibot.Site(url=url) self.repo = self.site.data_repository() self.template_name = "Property dashboard" self.end_template_name = "Property dashboard end" self.summary = "Update property usage stats" self.outputs = [] if not cache_client: host = os.getenv("REDIS_HOST", "tools-redis.svc.eqiad.wmflabs") cache_client = StrictRedis(host=host, decode_responses=False) self.cache = RedisCache(cache_client=cache_client) def make_cache_key(self, page_title): return ":".join([self.site.code, page_title]).replace(" ", "_") def get_all_pages(self): template = pywikibot.Page(self.site, self.template_name, ns=10) return template.getReferences(only_template_inclusion=True) @staticmethod def extract_elements_from_template_param(template_param): """Extract and sanitize the contents of a parsed template param.""" (field, _, value) = template_param.partition("=") return (field.strip(), value.replace("{{!}}", "|")) def 
def make_stats_object_arguments_for_page(self, page):
    """Build, cache and return the statistics configuration for ``page``.

    The page's templates are read once via ``page.templatesWithParams()``.
    The parameters of the first start template are turned into a
    configuration with ``parse_config_from_params`` and ``parse_config``,
    stored in ``self.cache`` under ``make_cache_key(page.title())`` and
    returned.

    Args:
        page: Page object exposing ``templatesWithParams()`` and ``title()``.

    Returns:
        The configuration produced by ``self.parse_config``.

    Raises:
        NoStartTemplateException: No template named ``self.template_name``
            is present on the page.
        NoEndTemplateException: No template named
            ``self.end_template_name`` is present on the page.
    """
    # Resolve each template title once; the checks below all work on titles.
    titled_templates = [
        (template.title(with_ns=False), params)
        for (template, params) in page.templatesWithParams()
    ]
    titles = {title for (title, _) in titled_templates}

    if self.template_name not in titles:
        # Adjacent literals are concatenated verbatim, so every sentence
        # needs its own trailing space.
        msg = (
            "No start template '%s' found. "
            "The likely explanation is that inteGraality was invoked from a page that transcludes the page with the template. "
            "Please invoke inteGraality directly from the page with the template."
            % self.template_name
        )
        raise NoStartTemplateException(msg)

    if self.end_template_name not in titles:
        raise NoEndTemplateException(
            "No end template '%s' provided" % self.end_template_name
        )

    start_params = [
        params
        for (title, params) in titled_templates
        if title == self.template_name
    ]
    if len(start_params) > 1:
        # Only the first start template is honoured; tell the operator.
        pywikibot.warn("More than one template on the page %s" % page.title())

    config = self.parse_config(self.parse_config_from_params(start_params[0]))
    self.cache.set_cache_value(self.make_cache_key(page.title()), config)
    return config
class QLeverSparqlQueryEngine(SparqlQueryEngine):
    """SPARQL engine that runs SELECT queries against a QLever HTTP endpoint."""

    # Human-readable engine label.
    # NOTE(review): presumably what get_sparql_engine_name() reports in the
    # edit summary -- confirm against the caller.
    name = "QLever"

    def __init__(self, endpoint="https://qlever.dev/api/wikidata", timeout=30):
        """Store the connection settings.

        Args:
            endpoint: URL the queries are sent to with HTTP GET.
            timeout: Timeout, in seconds, passed to ``requests.get``.
        """
        self.endpoint = endpoint
        self.timeout = timeout

    def select(self, query):
        """Run ``query`` and return its rows as a list of dicts.

        The query is first passed through ``add_prefixes_to_query``; the
        prefixed text is what gets sent and what is attached to any
        ``QueryException``.

        Args:
            query: SPARQL query text.

        Returns:
            A list with one ``{variable: value}`` dict per result binding.

        Raises:
            QueryException: The request timed out, failed at the transport
                or HTTP level, or the response body was not valid JSON.
        """
        query = add_prefixes_to_query(query)
        try:
            response = requests.get(
                self.endpoint, params={"query": query}, timeout=self.timeout
            )
            response.raise_for_status()
            data = response.json()
        except requests.exceptions.Timeout as e:
            # Must precede the RequestException handler: Timeout is one of
            # its subclasses and is the only case that really is a timeout.
            raise QueryException(
                "QLever timed out when running a SPARQL query. "
                "You might be trying to do something too expensive.",
                query=query,
            ) from e
        except (requests.exceptions.RequestException, ValueError) as e:
            # HTTP error statuses, connection failures and undecodable
            # bodies (a plain ValueError on older requests releases).
            raise QueryException(
                "QLever is not available, please try again later.",
                query=query,
            ) from e
        return self._transform_response(data)

    def _transform_response(self, data):
        """Flatten a decoded response into a list of ``{var: value}`` dicts.

        Only the ``"value"`` entry of each bound variable is kept. An empty
        list is returned when ``data`` has no ``results``/``bindings`` entry.
        """
        if "results" not in data or "bindings" not in data["results"]:
            return []
        return [
            {var: value["value"] for var, value in binding.items()}
            for binding in data["results"]["bindings"]
        ]