diff --git a/integraality/pages_processor.py b/integraality/pages_processor.py
index 9a41f04..e44211e 100644
--- a/integraality/pages_processor.py
+++ b/integraality/pages_processor.py
@@ -1,242 +1,243 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Bot to generate statistics
"""
import os
import re
from redis import StrictRedis
import pywikibot
from cache import RedisCache
from column import ColumnMaker, ColumnSyntaxException
from grouping import (
GroupingConfigurationMaker,
UnsupportedGroupingConfigurationException,
)
from page_saving import save_to_wiki_or_local
from property_statistics import PropertyStatistics
from sparql_utils import QueryException, SparqlEngineBuilder
# Template parameters that must be present in a dashboard configuration;
# parse_config raises ConfigException when any of them is missing.
REQUIRED_CONFIG_FIELDS = ["selector_sparql", "grouping_property", "properties"]
class ProcessingException(Exception):
    """Base class for the errors raised while processing a dashboard page."""
class ConfigException(ProcessingException):
    """Raised when the dashboard template configuration cannot be used."""
class NoEndTemplateException(ProcessingException):
    """Raised when a page does not contain the end template."""
class NoStartTemplateException(ProcessingException):
    """Raised when a page does not contain the start template."""
class PagesProcessor:
def __init__(self, url="https://www.wikidata.org/wiki/", cache_client=None):
    """Set up the wiki site, the template names, the edit summary and the cache.

    Args:
        url: URL handed to ``pywikibot.Site``.
        cache_client: client wrapped by ``RedisCache``. When it is falsy, a
            ``StrictRedis`` client is created on the host named by the
            ``REDIS_HOST`` environment variable, falling back to
            ``tools-redis.svc.eqiad.wmflabs``.
    """
    self.template_name = "Property dashboard"
    self.end_template_name = "Property dashboard end"
    self.summary = "Update property usage stats"
    self.outputs = []

    site = pywikibot.Site(url=url)
    self.site = site
    self.repo = site.data_repository()

    if not cache_client:
        cache_client = StrictRedis(
            host=os.getenv("REDIS_HOST", "tools-redis.svc.eqiad.wmflabs"),
            decode_responses=False,
        )
    self.cache = RedisCache(cache_client=cache_client)
def make_cache_key(self, page_title):
    """Return the cache key for a page.

    The key is the site code and the page title separated by a colon, with
    every space replaced by an underscore.
    """
    raw_key = self.site.code + ":" + page_title
    return raw_key.replace(" ", "_")
def get_all_pages(self):
    """Return the references to the start template, limited to template inclusions.

    The template page is looked up by ``self.template_name`` in namespace 10.
    """
    return pywikibot.Page(self.site, self.template_name, ns=10).getReferences(
        only_template_inclusion=True
    )
@staticmethod
def extract_elements_from_template_param(template_param):
    """Split a parsed template parameter into a ``(field, value)`` pair.

    The parameter is cut at its first ``=``. The field name is stripped of
    surrounding whitespace; in the value, every ``{{!}}`` is turned back into
    a pipe character. The value is otherwise left untouched (not stripped).
    """
    name, _separator, raw_value = template_param.partition("=")
    field = name.strip()
    value = raw_value.replace("{{!}}", "|")
    return field, value
def parse_config_from_params(self, params):
    """Build the raw configuration dict from a template's parameters.

    Each parameter goes through ``extract_elements_from_template_param``;
    entries whose field name is empty are dropped, and a field that appears
    several times keeps its last value.
    """
    config = {}
    for param in params:
        field, value = self.extract_elements_from_template_param(param)
        if field:
            config[field] = value
    return config
def make_stats_object_arguments_for_page(self, page):
    """Parse the dashboard configuration of a page and cache it.

    Args:
        page: page object providing ``templatesWithParams()`` and ``title()``.

    Returns:
        The configuration returned by ``parse_config``, which is also stored
        in the cache under the page's cache key.

    Raises:
        NoStartTemplateException: the page has no start template.
        NoEndTemplateException: the page has no end template.
    """
    all_templates_with_params = page.templatesWithParams()
    # Compute the titles once; they are needed for both presence checks and
    # for selecting the start template(s).
    template_titles = [
        template.title(with_ns=False) for (template, _) in all_templates_with_params
    ]

    if self.template_name not in template_titles:
        # The three literals are concatenated before '%' is applied, so each
        # sentence must carry its own trailing space.
        msg = (
            "No start template '%s' found. "
            "The likely explanation is that inteGraality was invoked from a page that transcludes the page with the template. "
            "Please invoke inteGraality directly from the page with the template."
            % self.template_name
        )
        raise NoStartTemplateException(msg)

    if self.end_template_name not in template_titles:
        raise NoEndTemplateException(
            "No end template '%s' provided" % self.end_template_name
        )

    start_template_params = [
        params
        for ((_, params), title) in zip(all_templates_with_params, template_titles)
        if title == self.template_name
    ]
    if len(start_template_params) > 1:
        pywikibot.warn("More than one template on the page %s" % page.title())

    # Only the first start template on the page is used.
    parsed_config = self.parse_config_from_params(start_template_params[0])
    config = self.parse_config(parsed_config)
    key = self.make_cache_key(page.title())
    self.cache.set_cache_value(key, config)
    return config
def make_stats_object_for_page(self, page):
    """Build the ``PropertyStatistics`` object for a page.

    The keyword arguments come from ``make_stats_object_arguments_for_page``.

    Raises:
        ConfigException: building ``PropertyStatistics`` from those
            arguments raised a ``TypeError``.
    """
    stats_kwargs = self.make_stats_object_arguments_for_page(page)
    try:
        stats = PropertyStatistics(**stats_kwargs)
    except TypeError:
        raise ConfigException("The template parameters are incorrect.")
    return stats
def process_page(self, page):
# Regenerate the statistics of `page` and save the updated page text.
# Steps, in order: invalidate the cache entry stored under the page's cache
# key, build the stats object from the page, run it, and pass its output and
# the current page text through replace_in_page.
self.cache.invalidate(self.make_cache_key(page.title()))
stats = self.make_stats_object_for_page(page)
output = stats.retrieve_and_process_data()
new_text = self.replace_in_page(output, page.get())
# Diff hunk: the summary passed to save_to_wiki_or_local changes from the
# plain self.summary to self.summary followed by " using <engine name>",
# where the name comes from stats.get_sparql_engine_name().
- save_to_wiki_or_local(page, self.summary, new_text)
+ summary = self.summary + f" using {stats.get_sparql_engine_name()}"
+ save_to_wiki_or_local(page, summary, new_text)
def parse_config(self, config):
    """Validate a raw template configuration and turn it into stats arguments.

    The dict is modified in place and returned: ``properties`` becomes
    ``columns``; the grouping-related keys (``grouping_property``,
    ``higher_grouping``, ``grouping_threshold``, ``grouping_link``) are
    replaced by ``grouping_configuration``; ``sparql_endpoint`` is replaced
    by ``sparql_query_engine``; ``stats_for_no_group`` is coerced with
    ``bool()``.

    Args:
        config: dict of raw template parameters.

    Returns:
        The same dict, transformed as described above.

    Raises:
        ConfigException: a required field is missing, ``grouping_threshold``
            is not an integer, or the grouping configuration is unsupported.
    """
    for field in REQUIRED_CONFIG_FIELDS:
        if field not in config:
            pywikibot.output("Missing required field %s" % field)
            raise ConfigException("A required field is missing: %s" % field)

    config["columns"] = self.parse_config_properties(config["properties"])
    del config["properties"]

    # The threshold is typed by a wiki editor, so a bad value must surface as
    # a configuration error rather than as a bare ValueError.
    raw_threshold = config.pop("grouping_threshold", 20)
    try:
        grouping_threshold = int(raw_threshold)
    except (TypeError, ValueError) as e:
        raise ConfigException(
            f"grouping_threshold must be an integer, got {raw_threshold!r}"
        ) from e

    try:
        config["grouping_configuration"] = GroupingConfigurationMaker.make(
            self.repo,
            config.pop("grouping_property"),
            config.pop("higher_grouping", None),
            grouping_threshold,
            config.pop("grouping_link", None),
        )
    except UnsupportedGroupingConfigurationException as e:
        raise ConfigException(e) from e

    config["stats_for_no_group"] = bool(config.get("stats_for_no_group", False))
    config["sparql_query_engine"] = SparqlEngineBuilder.make(
        config.pop("sparql_endpoint", None)
    )
    return config
@staticmethod
def parse_config_properties(properties_string):
    """Turn the comma-separated ``properties`` parameter into column objects.

    Each entry is stripped and, when it contains exactly one colon, split
    into ``key:title``; otherwise the whole entry is the key and the title is
    ``None``. Entries with an empty key are skipped. Columns are built with
    ``ColumnMaker.make(key, title)``.

    Raises:
        ConfigException: ``ColumnMaker.make`` raised ``ColumnSyntaxException``.
    """
    columns = []
    for raw_entry in properties_string.split(","):
        entry = raw_entry.strip()
        pieces = entry.split(":")
        if len(pieces) == 2:
            key, title = pieces
        else:
            key, title = entry, None
        if not key:
            continue
        try:
            columns.append(ColumnMaker.make(key, title))
        except ColumnSyntaxException as e:
            raise ConfigException(e)
    return columns
def replace_in_page(self, output, page_text):
regex_text = f"({{{{{self.template_name}.*?(?",
"PREFIX wdt: ",
"PREFIX p: ",
"PREFIX ps: ",
"PREFIX pq: ",
"PREFIX rdfs: ",
"PREFIX schema: ",
"PREFIX bd: ",
"PREFIX wikibase: ",
"PREFIX wdno: ",
]
return "\n".join(prefixes) + "\n" + query
class QLeverSparqlQueryEngine(SparqlQueryEngine):
+ name = "QLever"
+
def __init__(self, endpoint="https://qlever.dev/api/wikidata"):
    """Remember the URL of the QLever endpoint that ``select`` sends queries to."""
    self.endpoint = endpoint
def select(self, query):
    """Run a SPARQL query against the QLever endpoint and return its rows.

    The query is first passed through ``add_prefixes_to_query``, then sent
    as the ``query`` parameter of a GET request with a 30 second timeout.

    Args:
        query: SPARQL query text.

    Returns:
        The JSON response converted by ``_transform_response``.

    Raises:
        QueryException: the request timed out, failed, returned an HTTP
            error status, or returned a body that is not valid JSON. The
            original exception is attached as the cause.
    """
    query = add_prefixes_to_query(query)
    try:
        response = requests.get(self.endpoint, params={"query": query}, timeout=30)
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.Timeout as e:
        # Only genuine timeouts get the "too expensive" hint.
        raise QueryException(
            "QLever timed out when running a SPARQL query. "
            "You might be trying to do something too expensive.",
            query=query,
        ) from e
    except (requests.exceptions.RequestException, ValueError) as e:
        # HTTP error statuses, connection failures and undecodable bodies.
        # ValueError covers requests versions whose JSON decoding error is
        # not a RequestException.
        raise QueryException(
            "QLever is not available, please try again later.",
            query=query,
        ) from e
    return self._transform_response(data)
def _transform_response(self, data):
    """Flatten a QLever JSON response into a list of ``{variable: value}`` rows.

    Each entry of ``data["results"]["bindings"]`` becomes one dict mapping
    the variable name to the ``"value"`` field of its binding. An empty list
    is returned when ``results`` or ``bindings`` is absent.
    """
    if "results" not in data or "bindings" not in data["results"]:
        return []
    return [
        {variable: cell["value"] for variable, cell in binding.items()}
        for binding in data["results"]["bindings"]
    ]