diff --git a/integraality/pages_processor.py b/integraality/pages_processor.py index 0552d5e..9a41f04 100644 --- a/integraality/pages_processor.py +++ b/integraality/pages_processor.py @@ -1,239 +1,242 @@ #!/usr/bin/python # -*- coding: utf-8 -*- """ Bot to generate statistics """ import os import re from redis import StrictRedis import pywikibot from cache import RedisCache from column import ColumnMaker, ColumnSyntaxException from grouping import ( GroupingConfigurationMaker, UnsupportedGroupingConfigurationException, ) from page_saving import save_to_wiki_or_local from property_statistics import PropertyStatistics -from sparql_utils import QueryException +from sparql_utils import QueryException, SparqlEngineBuilder REQUIRED_CONFIG_FIELDS = ["selector_sparql", "grouping_property", "properties"] class ProcessingException(Exception): pass class ConfigException(ProcessingException): pass class NoEndTemplateException(ProcessingException): pass class NoStartTemplateException(ProcessingException): pass class PagesProcessor: def __init__(self, url="https://www.wikidata.org/wiki/", cache_client=None): self.site = pywikibot.Site(url=url) self.repo = self.site.data_repository() self.template_name = "Property dashboard" self.end_template_name = "Property dashboard end" self.summary = "Update property usage stats" self.outputs = [] if not cache_client: host = os.getenv("REDIS_HOST", "tools-redis.svc.eqiad.wmflabs") cache_client = StrictRedis(host=host, decode_responses=False) self.cache = RedisCache(cache_client=cache_client) def make_cache_key(self, page_title): return ":".join([self.site.code, page_title]).replace(" ", "_") def get_all_pages(self): template = pywikibot.Page(self.site, self.template_name, ns=10) return template.getReferences(only_template_inclusion=True) @staticmethod def extract_elements_from_template_param(template_param): """Extract and sanitize the contents of a parsed template param.""" (field, _, value) = template_param.partition("=") return (field.strip(), value.replace("{{!}}", "|")) def parse_config_from_params(self, params): return { key: value for (key, value) in [ self.extract_elements_from_template_param(param) for param in params ] if key } def make_stats_object_arguments_for_page(self, page): all_templates_with_params = page.templatesWithParams() if self.template_name not in [ template.title(with_ns=False) for (template, _) in all_templates_with_params ]: msg = ( "No start template '%s' found." "The likely explanation is that inteGraality was invoked from a page that transcludes the page with the template. " "Please invoke inteGraality directly from the page with the template." % self.template_name ) raise NoStartTemplateException(msg) if self.end_template_name not in [ template.title(with_ns=False) for (template, _) in all_templates_with_params ]: raise NoEndTemplateException( "No end template '%s' provided" % self.end_template_name ) start_templates_with_params = [ (template, params) for (template, params) in all_templates_with_params if template.title(with_ns=False) == self.template_name ] if len(start_templates_with_params) > 1: pywikibot.warn("More than one template on the page %s" % page.title()) (template, params) = start_templates_with_params[0] parsed_config = self.parse_config_from_params(params) config = self.parse_config(parsed_config) key = self.make_cache_key(page.title()) self.cache.set_cache_value(key, config) return config def make_stats_object_for_page(self, page): config = self.make_stats_object_arguments_for_page(page) try: return PropertyStatistics(**config) except TypeError: raise ConfigException("The template parameters are incorrect.") def process_page(self, page): self.cache.invalidate(self.make_cache_key(page.title())) stats = self.make_stats_object_for_page(page) output = stats.retrieve_and_process_data() new_text = self.replace_in_page(output, page.get()) save_to_wiki_or_local(page, self.summary, new_text) def parse_config(self, config): for field in REQUIRED_CONFIG_FIELDS: if field not in config: pywikibot.output("Missing required field %s" % field) raise ConfigException("A required field is missing: %s" % field) config["columns"] = self.parse_config_properties(config["properties"]) del config["properties"] try: config["grouping_configuration"] = GroupingConfigurationMaker.make( self.repo, config.pop("grouping_property"), config.pop("higher_grouping", None), int(config.pop("grouping_threshold", 20)), config.pop("grouping_link", None), ) except UnsupportedGroupingConfigurationException as e: raise ConfigException(e) config["stats_for_no_group"] = bool(config.get("stats_for_no_group", False)) + config["sparql_query_engine"] = SparqlEngineBuilder.make( + config.pop("sparql_endpoint", None) + ) return config @staticmethod def parse_config_properties(properties_string): properties = [x.strip() for x in properties_string.split(",")] properties_data = [] for prop in properties: try: (key, title) = prop.split(":") except ValueError: (key, title) = (prop, None) if key: try: properties_data.append(ColumnMaker.make(key, title)) except ColumnSyntaxException as e: raise ConfigException(e) return properties_data def replace_in_page(self, output, page_text): regex_text = f"({{{{{self.template_name}.*?(?", "PREFIX wdt: ", "PREFIX p: ", "PREFIX ps: ", "PREFIX pq: ", "PREFIX rdfs: ", "PREFIX schema: ", "PREFIX bd: ", "PREFIX wikibase: ", "PREFIX wdno: ", ] return "\n".join(prefixes) + "\n" + query class QLeverSparqlQueryEngine(SparqlQueryEngine): - def __init__(self): - self.endpoint = "https://qlever.dev/api/wikidata" + def __init__(self, endpoint="https://qlever.dev/api/wikidata"): + self.endpoint = endpoint def select(self, query): try: query = add_prefixes_to_query(query) params = {"query": query} response = requests.get(self.endpoint, params=params, timeout=30) response.raise_for_status() data = response.json() return self._transform_response(data) except requests.exceptions.HTTPError as e: raise QueryException( "QLever is not available, please try again later.", query=query, ) from e except (requests.exceptions.Timeout, requests.exceptions.RequestException): raise QueryException( "QLever timed out when running a SPARQL query." "You might be trying to do something too expensive.", query=query, ) def _transform_response(self, data): """Transform QLever response to expected format.""" if "results" in data and "bindings" in data["results"]: result = [] for binding in data["results"]["bindings"]: row = {} for var, value in binding.items(): row[var] = value["value"] result.append(row) return result return [] diff --git a/integraality/tests/test_pages_processor.py b/integraality/tests/test_pages_processor.py index 54563fb..7823a4e 100644 --- a/integraality/tests/test_pages_processor.py +++ b/integraality/tests/test_pages_processor.py @@ -1,291 +1,315 @@ # -*- coding: utf-8 -*- """Unit tests for functions.py.""" import argparse import unittest from unittest.mock import patch import fakeredis from integraality.column import DescriptionColumn, LabelColumn, PropertyColumn from integraality.grouping import ItemGroupingConfiguration from integraality.pages_processor import ConfigException, PagesProcessor, main +from sparql_utils import QLeverSparqlQueryEngine, WdqsSparqlQueryEngine class ProcessortTest(unittest.TestCase): def setUp(self): fake_cache_client = fakeredis.FakeStrictRedis() self.processor = PagesProcessor(cache_client=fake_cache_client) class TestReplaceInPage(ProcessortTest): def setUp(self): self.processor = PagesProcessor() self.text = """ Head {{Property dashboard start |properties=P136:genre,P404 |grouping_property=P400 |stats_for_no_group=1 |selector_sparql=wdt:P31/wdt:P279* wd:Q7889 |target_page_title=Wikidata:WikiProject Video games/Statistics/Platform |grouping_link=Wikidata::WikiProject Video games/Reports/Platform }} foo {{Property dashboard end}} Bottom """ self.final_text = """ Head {{Property dashboard start |properties=P136:genre,P404 |grouping_property=P400 |stats_for_no_group=1 |selector_sparql=wdt:P31/wdt:P279* wd:Q7889 |target_page_title=Wikidata:WikiProject Video games/Statistics/Platform |grouping_link=Wikidata::WikiProject Video games/Reports/Platform }} bar {{Property dashboard end}} Bottom """ def test_replace_in_page(self): result = self.processor.replace_in_page("bar", self.text) self.assertEqual(result, self.final_text) def test_replace_in_page_escaped_pipe(self): text = self.text.replace("wd:Q7889", "{{!}}") final_text = self.final_text.replace("wd:Q7889", "{{!}}") result = self.processor.replace_in_page("bar", text) self.assertEqual(result, final_text) class TestParseConfig(ProcessortTest): def setUp(self): self.processor = PagesProcessor() def test_normal_config(self): input_config = { "grouping_link": "Wikidata:WikiProject Video games/Reports/Platform", "grouping_property": "P400", "stats_for_no_group": "1", "properties": "P136:genre,P404", "selector_sparql": "wdt:P31/wdt:P279* wd:Q7889", } result = self.processor.parse_config(input_config) expected = { "grouping_configuration": ItemGroupingConfiguration( property="P400", base_grouping_link="Wikidata:WikiProject Video games/Reports/Platform", ), "stats_for_no_group": True, "columns": [ PropertyColumn(property="P136", title="genre"), PropertyColumn(property="P404"), ], "selector_sparql": "wdt:P31/wdt:P279* wd:Q7889", } + self.assertIsInstance(result.pop("sparql_query_engine"), WdqsSparqlQueryEngine) self.assertEqual(result, expected) def test_minimal_config(self): input_config = { "selector_sparql": "wdt:P31/wdt:P279* wd:Q7889", "grouping_property": "P400", "properties": "P136:genre,P404", } result = self.processor.parse_config(input_config) expected = { "selector_sparql": "wdt:P31/wdt:P279* wd:Q7889", "grouping_configuration": ItemGroupingConfiguration(property="P400"), "columns": [ PropertyColumn(property="P136", title="genre"), PropertyColumn(property="P404"), ], "stats_for_no_group": False, } + self.assertIsInstance(result.pop("sparql_query_engine"), WdqsSparqlQueryEngine) self.assertEqual(result, expected) def test_full_config(self): input_config = { "grouping_property": "P400", "stats_for_no_group": "1", "properties": "P136:genre,P404", "selector_sparql": "wdt:P31/wdt:P279* wd:Q7889", "grouping_threshold": "1", "property_threshold": "2", } result = self.processor.parse_config(input_config) expected = { "selector_sparql": "wdt:P31/wdt:P279* wd:Q7889", "grouping_configuration": ItemGroupingConfiguration( property="P400", grouping_threshold=1 ), "columns": [ PropertyColumn(property="P136", title="genre"), PropertyColumn(property="P404"), ], "stats_for_no_group": True, "property_threshold": "2", } + self.assertIsInstance(result.pop("sparql_query_engine"), WdqsSparqlQueryEngine) self.assertEqual(result, expected) def test_empty_config(self): input_config = {} with self.assertRaises(ConfigException): self.processor.parse_config(input_config) def test_insufficient_config(self): input_config = { "selector_sparql": "wdt:P31/wdt:P279* wd:Q7889", } with self.assertRaises(ConfigException): self.processor.parse_config(input_config) + def test_config_with_qlever_endpoint(self): + input_config = { + "selector_sparql": "wdt:P31/wdt:P279* wd:Q7889", + "grouping_property": "P400", + "properties": "P136:genre,P404", + "sparql_endpoint": "https://qlever.dev/wikidata/", + } + result = self.processor.parse_config(input_config) + self.assertIsInstance(result["sparql_query_engine"], QLeverSparqlQueryEngine) + + def test_config_with_wdqs_endpoint(self): + input_config = { + "selector_sparql": "wdt:P31/wdt:P279* wd:Q7889", + "grouping_property": "P400", + "properties": "P136:genre,P404", + "sparql_endpoint": "query.wikidata.org", + } + result = self.processor.parse_config(input_config) + self.assertIsInstance(result["sparql_query_engine"], WdqsSparqlQueryEngine) + class TestParseParams(ProcessortTest): def test_parse_config_from_params_minimal(self): params = [ "grouping_property=P195", "properties=P170:creator,P276", "selector_sparql=wdt:P31 wd:Q3305213", ] expected = { "grouping_property": "P195", "properties": "P170:creator,P276", "selector_sparql": "wdt:P31 wd:Q3305213", } result = self.processor.parse_config_from_params(params) self.assertEqual(result, expected) def test_parse_config_from_params_with_empty_param(self): params = [ "", "grouping_property=P195", "properties=P170:creator,P276", "selector_sparql=wdt:P31 wd:Q3305213", ] expected = { "grouping_property": "P195", "properties": "P170:creator,P276", "selector_sparql": "wdt:P31 wd:Q3305213", } result = self.processor.parse_config_from_params(params) self.assertEqual(result, expected) def test_parse_config_from_params_with_escaped_pipe(self): params = [ "grouping_property=P195", "properties=P170:creator,P276", 'selector_sparql=REGEX(?id, "^(a{{!}}b)")', ] expected = { "grouping_property": "P195", "properties": "P170:creator,P276", "selector_sparql": 'REGEX(?id, "^(a|b)")', } result = self.processor.parse_config_from_params(params) self.assertEqual(result, expected) class TestParseConfigProperties(ProcessortTest): def test(self): properties = "P136:genre,P404" result = self.processor.parse_config_properties(properties) expected = [ PropertyColumn(property="P136", title="genre"), PropertyColumn(property="P404"), ] self.assertEqual(result, expected) def test_with_trail_comma(self): properties = "P136:genre,P404," result = self.processor.parse_config_properties(properties) expected = [ PropertyColumn(property="P136", title="genre"), PropertyColumn(property="P404"), ] self.assertEqual(result, expected) def test_more_properties(self): properties = "P136,P178,P123,P495,P577,P404,P437" result = self.processor.parse_config_properties(properties) expected = [ PropertyColumn(property="P136"), PropertyColumn(property="P178"), PropertyColumn(property="P123"), PropertyColumn(property="P495"), PropertyColumn(property="P577"), PropertyColumn(property="P404"), PropertyColumn(property="P437"), ] self.assertEqual(result, expected) def test_with_qualifier(self): properties = "P136:genre,P404,P669/P670" result = self.processor.parse_config_properties(properties) expected = [ PropertyColumn(property="P136", title="genre"), PropertyColumn(property="P404"), PropertyColumn(property="P669", qualifier="P670"), ] self.assertEqual(result, expected) def test_with_qualifier_and_value(self): properties = "P136:genre,P404,P553/Q17459/P670" result = self.processor.parse_config_properties(properties) expected = [ PropertyColumn(property="P136", title="genre"), PropertyColumn(property="P404"), PropertyColumn(property="P553", value="Q17459", qualifier="P670"), ] self.assertEqual(result, expected) def test_with_label(self): properties = "P136:genre,Lbr,P553" result = self.processor.parse_config_properties(properties) expected = [ PropertyColumn(property="P136", title="genre"), LabelColumn(language="br"), PropertyColumn(property="P553"), ] self.assertEqual(result, expected) def test_with_description(self): properties = "P136:genre,Lxy,P553" result = self.processor.parse_config_properties(properties) expected = [ PropertyColumn(property="P136", title="genre"), DescriptionColumn(language="xy"), PropertyColumn(property="P553"), ] self.assertEqual(result, expected) def test_with_space(self): properties = "P131, P17" result = self.processor.parse_config_properties(properties) expected = [PropertyColumn(property="P131"), PropertyColumn(property="P17")] self.assertEqual(result, expected) def test_with_incorrect_syntax(self): properties = "P131,Something" with self.assertRaises(ConfigException): self.processor.parse_config_properties(properties) class TestMain(unittest.TestCase): def setUp(self): patcher1 = patch("integraality.pages_processor.PagesProcessor", autospec=True) self.mock_pages_processor = patcher1.start() self.addCleanup(patcher1.stop) patcher2 = patch("argparse.ArgumentParser.parse_args", autospec=True) self.mock_args = patcher2.start() self.addCleanup(patcher2.stop) def test_main_url_argument(self): url = "Foo" self.mock_args.return_value = argparse.Namespace(url=url) main() self.mock_pages_processor.assert_called_once_with(url) self.mock_pages_processor.return_value.process_all.assert_called_once_with() diff --git a/integraality/tests/test_sparql_utils.py b/integraality/tests/test_sparql_utils.py index 5f152dd..4313d5c 100644 --- a/integraality/tests/test_sparql_utils.py +++ b/integraality/tests/test_sparql_utils.py @@ -1,171 +1,197 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- import unittest from unittest.mock import Mock, patch import pywikibot import requests from sparql_utils import ( add_prefixes_to_query, QLeverSparqlQueryEngine, QueryException, + SparqlEngineBuilder, + UnsupportedSparqlEngineException, WdqsSparqlQueryEngine, ) class WdqsSparqlQueryEngineTest(unittest.TestCase): @patch("sparql_utils.pywikibot.data.sparql.SparqlQuery") def test_select(self, mock_sparql_query_class): mock_sq = Mock() mock_sq.select.return_value = [{"count": "42"}] mock_sparql_query_class.return_value = mock_sq engine = WdqsSparqlQueryEngine() result = engine.select("SELECT (COUNT(*) as ?count) WHERE { ?s ?p ?o }") expected = [{"count": "42"}] self.assertEqual(result, expected) mock_sq.select.assert_called_once_with( "SELECT (COUNT(*) as ?count) WHERE { ?s ?p ?o }" ) @patch("sparql_utils.pywikibot.data.sparql.SparqlQuery") def test_select_timeout_error(self, mock_sparql_query_class): mock_sq = Mock() mock_sq.select.side_effect = pywikibot.exceptions.TimeoutError("Timeout") mock_sparql_query_class.return_value = mock_sq engine = WdqsSparqlQueryEngine() with self.assertRaises(QueryException) as cm: engine.select("SELECT * WHERE { ?s ?p ?o }") self.assertIn( "The Wikidata Query Service timed out when running a SPARQL query", str(cm.exception), ) self.assertEqual(cm.exception.query, "SELECT * WHERE { ?s ?p ?o }") class QLeverSparqlQueryEngineTest(unittest.TestCase): def setUp(self): self.engine = QLeverSparqlQueryEngine() @patch("requests.get") def test_select_success(self, mock_get): mock_response = Mock() mock_response.json.return_value = { "results": { "bindings": [ {"entity": {"value": "http://www.wikidata.org/entity/Q1"}}, {"entity": {"value": "http://www.wikidata.org/entity/Q2"}}, ] } } mock_get.return_value = mock_response result = self.engine.select("SELECT ?entity WHERE { ?entity wdt:P31 wd:Q5 }") expected = [ {"entity": "http://www.wikidata.org/entity/Q1"}, {"entity": "http://www.wikidata.org/entity/Q2"}, ] self.assertEqual(result, expected) @patch("requests.get") def test_select_timeout_error(self, mock_get): mock_get.side_effect = requests.exceptions.Timeout("Request timed out") with self.assertRaises(QueryException) as cm: self.engine.select("SELECT ?entity WHERE { ?entity wdt:P31 wd:Q5 }") self.assertIn("QLever timed out", str(cm.exception)) self.assertIsNotNone(cm.exception.query) @patch("requests.get") def test_select_503(self, mock_get): mock_get.side_effect = requests.exceptions.HTTPError() with self.assertRaises(QueryException) as cm: self.engine.select("SELECT ?entity WHERE { ?entity wdt:P31 wd:Q5 }") self.assertIn("QLever is not available", str(cm.exception)) self.assertIsNotNone(cm.exception.query) def test_transform_response_valid(self): data = { "results": { "bindings": [ {"entity": {"value": "http://www.wikidata.org/entity/Q1"}}, {"count": {"value": "42"}}, ] } } result = self.engine._transform_response(data) expected = [{"entity": "http://www.wikidata.org/entity/Q1"}, {"count": "42"}] self.assertEqual(result, expected) def test_transform_response_empty(self): empty_data = {} result = self.engine._transform_response(empty_data) self.assertEqual(result, []) def test_transform_response_grouping_query(self): # Test with actual QLever API response format grouping_data = { "results": { "bindings": [ { "grouping": { "type": "uri", "value": "http://www.wikidata.org/entity/Q2047427", }, "higher_grouping": {"type": "literal", "value": "CHN"}, "grouping_link_value": { "type": "literal", "value": "The Palace Museum", "xml:lang": "en", }, "count": { "datatype": "http://www.w3.org/2001/XMLSchema#int", "type": "literal", "value": "46762", }, }, { "grouping": { "type": "uri", "value": "http://www.wikidata.org/entity/Q812285", }, "count": { "datatype": "http://www.w3.org/2001/XMLSchema#int", "type": "literal", "value": "18009", }, }, ] } } result = self.engine._transform_response(grouping_data) expected = [ { "grouping": "http://www.wikidata.org/entity/Q2047427", "higher_grouping": "CHN", "grouping_link_value": "The Palace Museum", "count": "46762", }, {"grouping": "http://www.wikidata.org/entity/Q812285", "count": "18009"}, ] self.assertEqual(result, expected) class AddPrefixesToQueryTest(unittest.TestCase): def test_add_prefixes_to_query(self): query = "SELECT ?item WHERE { ?item wdt:P31 wd:Q5 }" result = add_prefixes_to_query(query) self.assertIn("PREFIX wd: ", result) self.assertIn("PREFIX wdt: ", result) self.assertIn(query, result) self.assertTrue(result.endswith(query)) + + +class SparqlEngineBuilderTest(unittest.TestCase): + def test_create_qlever_engine_url(self): + engine = SparqlEngineBuilder.make("https://qlever.dev/api/wikidata") + self.assertIsInstance(engine, QLeverSparqlQueryEngine) + self.assertEqual(engine.endpoint, "https://qlever.dev/api/wikidata") + + def test_create_qlever_engine_name(self): + engine = SparqlEngineBuilder.make("qlever") + self.assertIsInstance(engine, QLeverSparqlQueryEngine) + self.assertEqual(engine.endpoint, "https://qlever.dev/api/wikidata") + + def test_create_wdqs_engine_wdqs(self): + engine = SparqlEngineBuilder.make("query.wikidata.org") + self.assertIsInstance(engine, WdqsSparqlQueryEngine) + + def test_create_wdqs_engine_default(self): + engine = SparqlEngineBuilder.make() + self.assertIsInstance(engine, WdqsSparqlQueryEngine) + + def test_create_wdqs_engine_unsupported(self): + with self.assertRaises(UnsupportedSparqlEngineException): + SparqlEngineBuilder.make("foo")