diff --git a/integraality/pages_processor.py b/integraality/pages_processor.py
index 0552d5e..9a41f04 100644
--- a/integraality/pages_processor.py
+++ b/integraality/pages_processor.py
@@ -1,239 +1,242 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Bot to generate statistics
"""
import os
import re
from redis import StrictRedis
import pywikibot
from cache import RedisCache
from column import ColumnMaker, ColumnSyntaxException
from grouping import (
GroupingConfigurationMaker,
UnsupportedGroupingConfigurationException,
)
from page_saving import save_to_wiki_or_local
from property_statistics import PropertyStatistics
-from sparql_utils import QueryException
+from sparql_utils import QueryException, SparqlEngineBuilder
REQUIRED_CONFIG_FIELDS = ["selector_sparql", "grouping_property", "properties"]
class ProcessingException(Exception):
    """Base class for the page-processing errors defined in this module."""
class ConfigException(ProcessingException):
    """Raised when the dashboard template configuration is missing or invalid."""
class NoEndTemplateException(ProcessingException):
    """Raised when the page does not contain the end template."""
class NoStartTemplateException(ProcessingException):
    """Raised when the page does not contain the start template."""
class PagesProcessor:
def __init__(self, url="https://www.wikidata.org/wiki/", cache_client=None):
    """Set up the wiki site, template names, edit summary and config cache.

    When no usable ``cache_client`` is given, a Redis client is created for
    the host named by the ``REDIS_HOST`` environment variable (falling back
    to the default host below).
    """
    site = pywikibot.Site(url=url)
    self.site = site
    self.repo = site.data_repository()
    self.template_name = "Property dashboard"
    self.end_template_name = "Property dashboard end"
    self.summary = "Update property usage stats"
    self.outputs = []
    client = cache_client or StrictRedis(
        host=os.getenv("REDIS_HOST", "tools-redis.svc.eqiad.wmflabs"),
        decode_responses=False,
    )
    self.cache = RedisCache(cache_client=client)
def make_cache_key(self, page_title):
    """Return the cache key "<site code>:<page title>" with spaces as underscores."""
    raw_key = ":".join((self.site.code, page_title))
    return raw_key.replace(" ", "_")
def get_all_pages(self):
    """Return the references to the start template (looked up in namespace 10),
    restricted to template inclusions."""
    dashboard_template = pywikibot.Page(self.site, self.template_name, ns=10)
    including_pages = dashboard_template.getReferences(only_template_inclusion=True)
    return including_pages
@staticmethod
def extract_elements_from_template_param(template_param):
    """Split a parsed template param into a sanitized (field, value) pair.

    The text before the first "=" is the field, stripped of surrounding
    whitespace; the remainder is the value, with every ``{{!}}`` escape
    turned back into a literal pipe.
    """
    name, _separator, raw_value = template_param.partition("=")
    field = name.strip()
    value = raw_value.replace("{{!}}", "|")
    return (field, value)
def parse_config_from_params(self, params):
    """Build a config dict from raw template params.

    Params whose field name is empty are skipped; when a field appears
    more than once the last value wins.
    """
    extracted = [
        self.extract_elements_from_template_param(raw_param) for raw_param in params
    ]
    config = {}
    for field, value in extracted:
        if field:
            config[field] = value
    return config
def make_stats_object_arguments_for_page(self, page):
    """Parse the dashboard configuration found on ``page`` and cache it.

    Args:
        page: page object exposing ``templatesWithParams()`` and ``title()``.

    Returns:
        The configuration produced by ``parse_config`` from the params of
        the first start template on the page. The same configuration is
        stored in the cache under the page's cache key.

    Raises:
        NoStartTemplateException: the start template is not on the page.
        NoEndTemplateException: the end template is not on the page.
    """
    # Materialize once and resolve each title once, instead of re-walking
    # the templates (and re-computing titles) for every membership test.
    titles_with_params = [
        (template.title(with_ns=False), params)
        for (template, params) in list(page.templatesWithParams())
    ]
    titles = [title for (title, _) in titles_with_params]

    if self.template_name not in titles:
        msg = (
            "No start template '%s' found. "
            "The likely explanation is that inteGraality was invoked from a page that transcludes the page with the template. "
            "Please invoke inteGraality directly from the page with the template."
            % self.template_name
        )
        raise NoStartTemplateException(msg)
    if self.end_template_name not in titles:
        raise NoEndTemplateException(
            "No end template '%s' provided" % self.end_template_name
        )

    start_template_params = [
        params
        for (title, params) in titles_with_params
        if title == self.template_name
    ]
    if len(start_template_params) > 1:
        pywikibot.warn("More than one template on the page %s" % page.title())

    # Only the first start template is honoured.
    parsed_config = self.parse_config_from_params(start_template_params[0])
    config = self.parse_config(parsed_config)
    key = self.make_cache_key(page.title())
    self.cache.set_cache_value(key, config)
    return config
def make_stats_object_for_page(self, page):
    """Build the statistics object configured by the dashboard template on ``page``.

    Raises:
        ConfigException: constructing ``PropertyStatistics`` from the parsed
            configuration raised a ``TypeError``.
    """
    config = self.make_stats_object_arguments_for_page(page)
    try:
        return PropertyStatistics(**config)
    except TypeError as err:
        # Chain the cause so the offending argument stays visible in tracebacks.
        raise ConfigException("The template parameters are incorrect.") from err
def process_page(self, page):
    """Regenerate the statistics for ``page`` and save the updated page text.

    The page's cached configuration is invalidated first.
    """
    cache_key = self.make_cache_key(page.title())
    self.cache.invalidate(cache_key)
    statistics = self.make_stats_object_for_page(page)
    report = statistics.retrieve_and_process_data()
    current_text = page.get()
    updated_text = self.replace_in_page(report, current_text)
    save_to_wiki_or_local(page, self.summary, updated_text)
def parse_config(self, config):
    """Turn raw template parameters into the arguments for ``PropertyStatistics``.

    ``config`` is modified in place and returned:

    * ``properties`` is replaced by ``columns`` (parsed column objects);
    * ``grouping_property``, ``higher_grouping``, ``grouping_threshold``
      (default 20) and ``grouping_link`` are replaced by
      ``grouping_configuration``;
    * ``stats_for_no_group`` is coerced to a bool;
    * ``sparql_endpoint`` is replaced by ``sparql_query_engine``.

    Any other key is left untouched.

    Raises:
        ConfigException: a required field is missing, ``grouping_threshold``
            is not an integer, the grouping configuration is unsupported,
            or a property has an invalid syntax.
    """
    for field in REQUIRED_CONFIG_FIELDS:
        if field not in config:
            pywikibot.output("Missing required field %s" % field)
            raise ConfigException("A required field is missing: %s" % field)
    config["columns"] = self.parse_config_properties(config["properties"])
    del config["properties"]

    # The threshold comes straight from user-edited wikitext: report a
    # non-numeric value as a configuration error, not a bare ValueError.
    raw_threshold = config.pop("grouping_threshold", 20)
    try:
        grouping_threshold = int(raw_threshold)
    except (TypeError, ValueError) as e:
        raise ConfigException(
            "grouping_threshold must be an integer, got %r" % (raw_threshold,)
        ) from e

    try:
        config["grouping_configuration"] = GroupingConfigurationMaker.make(
            self.repo,
            config.pop("grouping_property"),
            config.pop("higher_grouping", None),
            grouping_threshold,
            config.pop("grouping_link", None),
        )
    except UnsupportedGroupingConfigurationException as e:
        raise ConfigException(e) from e
    # NOTE: any non-empty string is truthy here, including "0".
    config["stats_for_no_group"] = bool(config.get("stats_for_no_group", False))
    config["sparql_query_engine"] = SparqlEngineBuilder.make(
        config.pop("sparql_endpoint", None)
    )
    return config
@staticmethod
def parse_config_properties(properties_string):
    """Parse the comma-separated ``properties`` parameter into column objects.

    Each entry is either ``key`` or ``key:title``. Surrounding whitespace is
    ignored and entries with an empty key (e.g. after a trailing comma) are
    skipped.

    Returns:
        The list of objects built by ``ColumnMaker.make``, in input order.

    Raises:
        ConfigException: ``ColumnMaker`` rejected the syntax of an entry.
    """
    properties_data = []
    for raw_entry in properties_string.split(","):
        prop = raw_entry.strip()
        try:
            (key, title) = prop.split(":")
        except ValueError:
            # Not exactly one colon: the whole entry is the key, no title.
            (key, title) = (prop, None)
        if not key:
            continue
        try:
            properties_data.append(ColumnMaker.make(key, title))
        except ColumnSyntaxException as e:
            # Chain the cause so the original syntax error is preserved.
            raise ConfigException(e) from e
    return properties_data
def replace_in_page(self, output, page_text):
regex_text = f"({{{{{self.template_name}.*?(?",
"PREFIX wdt: ",
"PREFIX p: ",
"PREFIX ps: ",
"PREFIX pq: ",
"PREFIX rdfs: ",
"PREFIX schema: ",
"PREFIX bd: ",
"PREFIX wikibase: ",
"PREFIX wdno: ",
]
return "\n".join(prefixes) + "\n" + query
class QLeverSparqlQueryEngine(SparqlQueryEngine):
    """SPARQL engine that sends queries to a QLever HTTP endpoint."""

    def __init__(self, endpoint="https://qlever.dev/api/wikidata", timeout=30):
        """
        Args:
            endpoint: URL the queries are sent to.
            timeout: timeout passed to ``requests.get`` for each query.
        """
        self.endpoint = endpoint
        self.timeout = timeout

    def select(self, query):
        """Run ``query`` (with the standard prefixes prepended) and return its rows.

        Returns:
            A list with one ``{variable: value}`` dict per result binding.

        Raises:
            QueryException: the request timed out, failed, or returned a
                body that is not valid JSON. The prefixed query is attached.
        """
        query = add_prefixes_to_query(query)
        try:
            response = requests.get(
                self.endpoint, params={"query": query}, timeout=self.timeout
            )
            response.raise_for_status()
        except requests.exceptions.Timeout as e:
            # Must come first: it is also a RequestException.
            raise QueryException(
                "QLever timed out when running a SPARQL query. "
                "You might be trying to do something too expensive.",
                query=query,
            ) from e
        except requests.exceptions.RequestException as e:
            # HTTP error statuses, connection failures, etc.
            raise QueryException(
                "QLever is not available, please try again later.",
                query=query,
            ) from e

        try:
            data = response.json()
        except ValueError as e:
            raise QueryException(
                "QLever returned a response that is not valid JSON.",
                query=query,
            ) from e
        return self._transform_response(data)

    def _transform_response(self, data):
        """Flatten a QLever JSON response into a list of ``{variable: value}`` dicts.

        Returns an empty list when ``data`` has no ``results.bindings``.
        """
        if "results" in data and "bindings" in data["results"]:
            return [
                {var: value["value"] for var, value in binding.items()}
                for binding in data["results"]["bindings"]
            ]
        return []
diff --git a/integraality/tests/test_pages_processor.py b/integraality/tests/test_pages_processor.py
index 54563fb..7823a4e 100644
--- a/integraality/tests/test_pages_processor.py
+++ b/integraality/tests/test_pages_processor.py
@@ -1,291 +1,315 @@
# -*- coding: utf-8 -*-
"""Unit tests for pages_processor.py."""
import argparse
import unittest
from unittest.mock import patch
import fakeredis
from integraality.column import DescriptionColumn, LabelColumn, PropertyColumn
from integraality.grouping import ItemGroupingConfiguration
from integraality.pages_processor import ConfigException, PagesProcessor, main
+from sparql_utils import QLeverSparqlQueryEngine, WdqsSparqlQueryEngine
class ProcessortTest(unittest.TestCase):
    """Shared fixture: a PagesProcessor whose cache client is a fakeredis instance."""

    def setUp(self):
        self.processor = PagesProcessor(cache_client=fakeredis.FakeStrictRedis())
class TestReplaceInPage(ProcessortTest):
    """Tests for PagesProcessor.replace_in_page."""

    def setUp(self):
        # Reuse the base fixture so the processor is built with the fakeredis
        # client instead of constructing a real Redis client.
        super().setUp()
        self.text = """
Head
{{Property dashboard start
|properties=P136:genre,P404
|grouping_property=P400
|stats_for_no_group=1
|selector_sparql=wdt:P31/wdt:P279* wd:Q7889
|target_page_title=Wikidata:WikiProject Video games/Statistics/Platform
|grouping_link=Wikidata::WikiProject Video games/Reports/Platform
}}
foo
{{Property dashboard end}}
Bottom
"""
        self.final_text = """
Head
{{Property dashboard start
|properties=P136:genre,P404
|grouping_property=P400
|stats_for_no_group=1
|selector_sparql=wdt:P31/wdt:P279* wd:Q7889
|target_page_title=Wikidata:WikiProject Video games/Statistics/Platform
|grouping_link=Wikidata::WikiProject Video games/Reports/Platform
}}
bar
{{Property dashboard end}}
Bottom
"""

    def test_replace_in_page(self):
        """The text between the start and end templates is replaced."""
        result = self.processor.replace_in_page("bar", self.text)
        self.assertEqual(result, self.final_text)

    def test_replace_in_page_escaped_pipe(self):
        """An escaped pipe inside the start template does not break the replacement."""
        text = self.text.replace("wd:Q7889", "{{!}}")
        final_text = self.final_text.replace("wd:Q7889", "{{!}}")
        result = self.processor.replace_in_page("bar", text)
        self.assertEqual(result, final_text)
class TestParseConfig(ProcessortTest):
    """Tests for PagesProcessor.parse_config.

    The processor comes from the base-class fixture, so it is built with the
    fakeredis client rather than a real Redis client.
    """

    def test_normal_config(self):
        input_config = {
            "grouping_link": "Wikidata:WikiProject Video games/Reports/Platform",
            "grouping_property": "P400",
            "stats_for_no_group": "1",
            "properties": "P136:genre,P404",
            "selector_sparql": "wdt:P31/wdt:P279* wd:Q7889",
        }
        result = self.processor.parse_config(input_config)
        expected = {
            "grouping_configuration": ItemGroupingConfiguration(
                property="P400",
                base_grouping_link="Wikidata:WikiProject Video games/Reports/Platform",
            ),
            "stats_for_no_group": True,
            "columns": [
                PropertyColumn(property="P136", title="genre"),
                PropertyColumn(property="P404"),
            ],
            "selector_sparql": "wdt:P31/wdt:P279* wd:Q7889",
        }
        # The engine is checked by type and removed before the dict comparison.
        self.assertIsInstance(result.pop("sparql_query_engine"), WdqsSparqlQueryEngine)
        self.assertEqual(result, expected)

    def test_minimal_config(self):
        input_config = {
            "selector_sparql": "wdt:P31/wdt:P279* wd:Q7889",
            "grouping_property": "P400",
            "properties": "P136:genre,P404",
        }
        result = self.processor.parse_config(input_config)
        expected = {
            "selector_sparql": "wdt:P31/wdt:P279* wd:Q7889",
            "grouping_configuration": ItemGroupingConfiguration(property="P400"),
            "columns": [
                PropertyColumn(property="P136", title="genre"),
                PropertyColumn(property="P404"),
            ],
            "stats_for_no_group": False,
        }
        self.assertIsInstance(result.pop("sparql_query_engine"), WdqsSparqlQueryEngine)
        self.assertEqual(result, expected)

    def test_full_config(self):
        input_config = {
            "grouping_property": "P400",
            "stats_for_no_group": "1",
            "properties": "P136:genre,P404",
            "selector_sparql": "wdt:P31/wdt:P279* wd:Q7889",
            "grouping_threshold": "1",
            "property_threshold": "2",
        }
        result = self.processor.parse_config(input_config)
        expected = {
            "selector_sparql": "wdt:P31/wdt:P279* wd:Q7889",
            "grouping_configuration": ItemGroupingConfiguration(
                property="P400", grouping_threshold=1
            ),
            "columns": [
                PropertyColumn(property="P136", title="genre"),
                PropertyColumn(property="P404"),
            ],
            "stats_for_no_group": True,
            "property_threshold": "2",
        }
        self.assertIsInstance(result.pop("sparql_query_engine"), WdqsSparqlQueryEngine)
        self.assertEqual(result, expected)

    def test_empty_config(self):
        input_config = {}
        with self.assertRaises(ConfigException):
            self.processor.parse_config(input_config)

    def test_insufficient_config(self):
        input_config = {
            "selector_sparql": "wdt:P31/wdt:P279* wd:Q7889",
        }
        with self.assertRaises(ConfigException):
            self.processor.parse_config(input_config)

    def test_config_with_qlever_endpoint(self):
        input_config = {
            "selector_sparql": "wdt:P31/wdt:P279* wd:Q7889",
            "grouping_property": "P400",
            "properties": "P136:genre,P404",
            "sparql_endpoint": "https://qlever.dev/wikidata/",
        }
        result = self.processor.parse_config(input_config)
        self.assertIsInstance(result["sparql_query_engine"], QLeverSparqlQueryEngine)

    def test_config_with_wdqs_endpoint(self):
        input_config = {
            "selector_sparql": "wdt:P31/wdt:P279* wd:Q7889",
            "grouping_property": "P400",
            "properties": "P136:genre,P404",
            "sparql_endpoint": "query.wikidata.org",
        }
        result = self.processor.parse_config(input_config)
        self.assertIsInstance(result["sparql_query_engine"], WdqsSparqlQueryEngine)
class TestParseParams(ProcessortTest):
    """Tests for PagesProcessor.parse_config_from_params."""

    def _assert_params_parse_to(self, raw_params, expected):
        """Parse ``raw_params`` and compare the resulting dict with ``expected``."""
        parsed = self.processor.parse_config_from_params(raw_params)
        self.assertEqual(parsed, expected)

    def test_parse_config_from_params_minimal(self):
        self._assert_params_parse_to(
            [
                "grouping_property=P195",
                "properties=P170:creator,P276",
                "selector_sparql=wdt:P31 wd:Q3305213",
            ],
            {
                "grouping_property": "P195",
                "properties": "P170:creator,P276",
                "selector_sparql": "wdt:P31 wd:Q3305213",
            },
        )

    def test_parse_config_from_params_with_empty_param(self):
        self._assert_params_parse_to(
            [
                "",
                "grouping_property=P195",
                "properties=P170:creator,P276",
                "selector_sparql=wdt:P31 wd:Q3305213",
            ],
            {
                "grouping_property": "P195",
                "properties": "P170:creator,P276",
                "selector_sparql": "wdt:P31 wd:Q3305213",
            },
        )

    def test_parse_config_from_params_with_escaped_pipe(self):
        self._assert_params_parse_to(
            [
                "grouping_property=P195",
                "properties=P170:creator,P276",
                'selector_sparql=REGEX(?id, "^(a{{!}}b)")',
            ],
            {
                "grouping_property": "P195",
                "properties": "P170:creator,P276",
                "selector_sparql": 'REGEX(?id, "^(a|b)")',
            },
        )
class TestParseConfigProperties(ProcessortTest):
    """Tests for PagesProcessor.parse_config_properties."""

    def test(self):
        properties = "P136:genre,P404"
        result = self.processor.parse_config_properties(properties)
        expected = [
            PropertyColumn(property="P136", title="genre"),
            PropertyColumn(property="P404"),
        ]
        self.assertEqual(result, expected)

    def test_with_trail_comma(self):
        properties = "P136:genre,P404,"
        result = self.processor.parse_config_properties(properties)
        expected = [
            PropertyColumn(property="P136", title="genre"),
            PropertyColumn(property="P404"),
        ]
        self.assertEqual(result, expected)

    def test_more_properties(self):
        properties = "P136,P178,P123,P495,P577,P404,P437"
        result = self.processor.parse_config_properties(properties)
        expected = [
            PropertyColumn(property="P136"),
            PropertyColumn(property="P178"),
            PropertyColumn(property="P123"),
            PropertyColumn(property="P495"),
            PropertyColumn(property="P577"),
            PropertyColumn(property="P404"),
            PropertyColumn(property="P437"),
        ]
        self.assertEqual(result, expected)

    def test_with_qualifier(self):
        properties = "P136:genre,P404,P669/P670"
        result = self.processor.parse_config_properties(properties)
        expected = [
            PropertyColumn(property="P136", title="genre"),
            PropertyColumn(property="P404"),
            PropertyColumn(property="P669", qualifier="P670"),
        ]
        self.assertEqual(result, expected)

    def test_with_qualifier_and_value(self):
        properties = "P136:genre,P404,P553/Q17459/P670"
        result = self.processor.parse_config_properties(properties)
        expected = [
            PropertyColumn(property="P136", title="genre"),
            PropertyColumn(property="P404"),
            PropertyColumn(property="P553", value="Q17459", qualifier="P670"),
        ]
        self.assertEqual(result, expected)

    def test_with_label(self):
        properties = "P136:genre,Lbr,P553"
        result = self.processor.parse_config_properties(properties)
        expected = [
            PropertyColumn(property="P136", title="genre"),
            LabelColumn(language="br"),
            PropertyColumn(property="P553"),
        ]
        self.assertEqual(result, expected)

    def test_with_description(self):
        # "Lxy" is the label syntax already covered by test_with_label; a
        # description column needs its own key.
        # NOTE(review): assumes ColumnMaker maps the "D" prefix to
        # DescriptionColumn - confirm against column.py.
        properties = "P136:genre,Dxy,P553"
        result = self.processor.parse_config_properties(properties)
        expected = [
            PropertyColumn(property="P136", title="genre"),
            DescriptionColumn(language="xy"),
            PropertyColumn(property="P553"),
        ]
        self.assertEqual(result, expected)

    def test_with_space(self):
        properties = "P131, P17"
        result = self.processor.parse_config_properties(properties)
        expected = [PropertyColumn(property="P131"), PropertyColumn(property="P17")]
        self.assertEqual(result, expected)

    def test_with_incorrect_syntax(self):
        properties = "P131,Something"
        with self.assertRaises(ConfigException):
            self.processor.parse_config_properties(properties)
class TestMain(unittest.TestCase):
    """Tests for the main() entry point, with PagesProcessor and argument parsing patched."""

    def setUp(self):
        processor_patcher = patch(
            "integraality.pages_processor.PagesProcessor", autospec=True
        )
        self.mock_pages_processor = processor_patcher.start()
        self.addCleanup(processor_patcher.stop)

        args_patcher = patch("argparse.ArgumentParser.parse_args", autospec=True)
        self.mock_args = args_patcher.start()
        self.addCleanup(args_patcher.stop)

    def test_main_url_argument(self):
        """main() builds the processor from the parsed URL and processes all pages."""
        self.mock_args.return_value = argparse.Namespace(url="Foo")
        main()
        self.mock_pages_processor.assert_called_once_with("Foo")
        processor_instance = self.mock_pages_processor.return_value
        processor_instance.process_all.assert_called_once_with()
diff --git a/integraality/tests/test_sparql_utils.py b/integraality/tests/test_sparql_utils.py
index 5f152dd..4313d5c 100644
--- a/integraality/tests/test_sparql_utils.py
+++ b/integraality/tests/test_sparql_utils.py
@@ -1,171 +1,197 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import unittest
from unittest.mock import Mock, patch
import pywikibot
import requests
from sparql_utils import (
add_prefixes_to_query,
QLeverSparqlQueryEngine,
QueryException,
+ SparqlEngineBuilder,
+ UnsupportedSparqlEngineException,
WdqsSparqlQueryEngine,
)
class WdqsSparqlQueryEngineTest(unittest.TestCase):
    """Tests for WdqsSparqlQueryEngine with pywikibot's SparqlQuery patched."""

    @patch("sparql_utils.pywikibot.data.sparql.SparqlQuery")
    def test_select(self, mock_sparql_query_class):
        """Rows returned by SparqlQuery.select are passed through unchanged."""
        count_query = "SELECT (COUNT(*) as ?count) WHERE { ?s ?p ?o }"
        sparql_query = Mock()
        sparql_query.select.return_value = [{"count": "42"}]
        mock_sparql_query_class.return_value = sparql_query

        rows = WdqsSparqlQueryEngine().select(count_query)

        self.assertEqual(rows, [{"count": "42"}])
        sparql_query.select.assert_called_once_with(count_query)

    @patch("sparql_utils.pywikibot.data.sparql.SparqlQuery")
    def test_select_timeout_error(self, mock_sparql_query_class):
        """A pywikibot timeout surfaces as a QueryException carrying the query."""
        failing_query = "SELECT * WHERE { ?s ?p ?o }"
        sparql_query = Mock()
        sparql_query.select.side_effect = pywikibot.exceptions.TimeoutError("Timeout")
        mock_sparql_query_class.return_value = sparql_query
        engine = WdqsSparqlQueryEngine()

        with self.assertRaises(QueryException) as cm:
            engine.select(failing_query)

        self.assertIn(
            "The Wikidata Query Service timed out when running a SPARQL query",
            str(cm.exception),
        )
        self.assertEqual(cm.exception.query, failing_query)
class QLeverSparqlQueryEngineTest(unittest.TestCase):
    """Tests for QLeverSparqlQueryEngine with requests.get patched."""

    def setUp(self):
        self.engine = QLeverSparqlQueryEngine()

    @patch("requests.get")
    def test_select_success(self, mock_get):
        """Bindings of a successful response are flattened to plain dicts."""
        payload = {
            "results": {
                "bindings": [
                    {"entity": {"value": "http://www.wikidata.org/entity/Q1"}},
                    {"entity": {"value": "http://www.wikidata.org/entity/Q2"}},
                ]
            }
        }
        http_response = Mock()
        http_response.json.return_value = payload
        mock_get.return_value = http_response

        rows = self.engine.select("SELECT ?entity WHERE { ?entity wdt:P31 wd:Q5 }")

        self.assertEqual(
            rows,
            [
                {"entity": "http://www.wikidata.org/entity/Q1"},
                {"entity": "http://www.wikidata.org/entity/Q2"},
            ],
        )

    @patch("requests.get")
    def test_select_timeout_error(self, mock_get):
        """A requests timeout becomes a QueryException mentioning the timeout."""
        mock_get.side_effect = requests.exceptions.Timeout("Request timed out")

        with self.assertRaises(QueryException) as cm:
            self.engine.select("SELECT ?entity WHERE { ?entity wdt:P31 wd:Q5 }")

        raised = cm.exception
        self.assertIn("QLever timed out", str(raised))
        self.assertIsNotNone(raised.query)

    @patch("requests.get")
    def test_select_503(self, mock_get):
        """An HTTP error becomes a QueryException saying QLever is unavailable."""
        mock_get.side_effect = requests.exceptions.HTTPError()

        with self.assertRaises(QueryException) as cm:
            self.engine.select("SELECT ?entity WHERE { ?entity wdt:P31 wd:Q5 }")

        raised = cm.exception
        self.assertIn("QLever is not available", str(raised))
        self.assertIsNotNone(raised.query)

    def test_transform_response_valid(self):
        payload = {
            "results": {
                "bindings": [
                    {"entity": {"value": "http://www.wikidata.org/entity/Q1"}},
                    {"count": {"value": "42"}},
                ]
            }
        }
        rows = self.engine._transform_response(payload)
        self.assertEqual(
            rows,
            [{"entity": "http://www.wikidata.org/entity/Q1"}, {"count": "42"}],
        )

    def test_transform_response_empty(self):
        rows = self.engine._transform_response({})
        self.assertEqual(rows, [])

    def test_transform_response_grouping_query(self):
        # Test with actual QLever API response format
        first_binding = {
            "grouping": {
                "type": "uri",
                "value": "http://www.wikidata.org/entity/Q2047427",
            },
            "higher_grouping": {"type": "literal", "value": "CHN"},
            "grouping_link_value": {
                "type": "literal",
                "value": "The Palace Museum",
                "xml:lang": "en",
            },
            "count": {
                "datatype": "http://www.w3.org/2001/XMLSchema#int",
                "type": "literal",
                "value": "46762",
            },
        }
        second_binding = {
            "grouping": {
                "type": "uri",
                "value": "http://www.wikidata.org/entity/Q812285",
            },
            "count": {
                "datatype": "http://www.w3.org/2001/XMLSchema#int",
                "type": "literal",
                "value": "18009",
            },
        }
        grouping_data = {"results": {"bindings": [first_binding, second_binding]}}

        rows = self.engine._transform_response(grouping_data)

        expected = [
            {
                "grouping": "http://www.wikidata.org/entity/Q2047427",
                "higher_grouping": "CHN",
                "grouping_link_value": "The Palace Museum",
                "count": "46762",
            },
            {"grouping": "http://www.wikidata.org/entity/Q812285", "count": "18009"},
        ]
        self.assertEqual(rows, expected)
class AddPrefixesToQueryTest(unittest.TestCase):
    """Tests for add_prefixes_to_query."""

    def test_add_prefixes_to_query(self):
        """Prefix declarations are prepended and the query itself stays at the end."""
        original_query = "SELECT ?item WHERE { ?item wdt:P31 wd:Q5 }"
        prefixed_query = add_prefixes_to_query(original_query)
        for declaration in ("PREFIX wd: ", "PREFIX wdt: "):
            self.assertIn(declaration, prefixed_query)
        self.assertIn(original_query, prefixed_query)
        self.assertTrue(prefixed_query.endswith(original_query))
+
+
class SparqlEngineBuilderTest(unittest.TestCase):
    """Tests for SparqlEngineBuilder.make."""

    QLEVER_ENDPOINT = "https://qlever.dev/api/wikidata"

    def _assert_qlever_engine(self, engine):
        """Check ``engine`` is a QLever engine bound to the expected endpoint."""
        self.assertIsInstance(engine, QLeverSparqlQueryEngine)
        self.assertEqual(engine.endpoint, self.QLEVER_ENDPOINT)

    def test_create_qlever_engine_url(self):
        self._assert_qlever_engine(SparqlEngineBuilder.make(self.QLEVER_ENDPOINT))

    def test_create_qlever_engine_name(self):
        self._assert_qlever_engine(SparqlEngineBuilder.make("qlever"))

    def test_create_wdqs_engine_wdqs(self):
        built = SparqlEngineBuilder.make("query.wikidata.org")
        self.assertIsInstance(built, WdqsSparqlQueryEngine)

    def test_create_wdqs_engine_default(self):
        built = SparqlEngineBuilder.make()
        self.assertIsInstance(built, WdqsSparqlQueryEngine)

    def test_create_wdqs_engine_unsupported(self):
        with self.assertRaises(UnsupportedSparqlEngineException):
            SparqlEngineBuilder.make("foo")