Using groups of choices in triples_choices (#43)

mwatts15 · mwatts15 · commit 84b8254dc722 · 2019-09-07T21:33:09.000-05:00
- SQLite only allows a certain number of terms in a WHERE clause. This
  change allows for calls to addN to always come under that limit.
- Also, renaming _remove_context param to match actual usage
- Improving test for _remove_context(), and adding tests for
  triples_choices() and contexts()
diff --git a/rdflib_sqlalchemy/store.py b/rdflib_sqlalchemy/store.py
@@ -48,6 +48,25 @@
 Any = None
 
 
+def grouper(iterable, n):
+    "Collect data into chunks of at most n elements"
+    assert n > 0, 'Cannot group into chunks of zero elements'
+    lst = []
+    iterable = iter(iterable)
+    while True:
+        try:
+            lst.append(next(iterable))
+        except StopIteration:
+            break
+
+        if len(lst) == n:
+            yield lst
+            lst = []
+
+    if lst:
+        yield lst
+
+
 def generate_interned_id(identifier):
     return "{prefix}{identifier_hash}".format(
         prefix=INTERNED_PREFIX,
@@ -78,7 +97,8 @@ class SQLAlchemy(Store, SQLGeneratorMixin, StatisticsMixin):
     regex_matching = PYTHON_REGEX
     configuration = Literal("sqlite://")
 
-    def __init__(self, identifier=None, configuration=None, engine=None):
+    def __init__(self, identifier=None, configuration=None, engine=None,
+                 max_terms_per_where=800):
         """
         Initialisation.
 
@@ -89,10 +109,14 @@ def __init__(self, identifier=None, configuration=None, engine=None):
                 with the additional "url" key pointing to the connection URL. See `open` documentation
                 for more details.
             engine (sqlalchemy.engine.Engine, optional): a pre-existing `SQLAlchemy.engine.Engine` instance.
-
+            max_terms_per_where (int): The max number of terms (s/p/o) in a call to
+                triples_choices to combine in one SQL "where" clause. Important for SQLite
+                back-end with SQLITE_MAX_EXPR_DEPTH limit and SQLITE_LIMIT_COMPOUND_SELECT
+                -- must find a balance that doesn't hit either of those.
         """
         self.identifier = identifier and identifier or "hardcoded"
         self.engine = engine
+        self.max_terms_per_where = max_terms_per_where
 
         # Use only the first 10 bytes of the digest
         self._interned_id = generate_interned_id(self.identifier)
@@ -383,25 +407,7 @@ def remove(self, triple, context):
                 _logger.exception("Removal failed.")
                 trans.rollback()
 
-    def triples(self, triple, context=None):
-        """
-        A generator over all the triples matching pattern.
-
-        Pattern can be any objects for comparing against nodes in
-        the store, for example, RegExLiteral, Date? DateRange?
-
-        quoted table:                <id>_quoted_statements
-        asserted rdf:type table:     <id>_type_statements
-        asserted non rdf:type table: <id>_asserted_statements
-
-        triple columns:
-            subject, predicate, object, context, termComb, objLanguage, objDatatype
-        class membership columns:
-            member, klass, context, termComb
-
-        FIXME:  These union all selects *may* be further optimized by joins
-
-        """
+    def _triples_helper(self, triple, context=None):
         subject, predicate, obj = triple
 
         quoted_table = self.tables["quoted_statements"]
@@ -427,10 +433,10 @@ class membership columns:
             # Literal partition if (obj is Literal or None) and asserted
             # non rdf:type partition (if obj is URIRef or None)
             selects = []
-            if not self.STRONGLY_TYPED_TERMS \
-                    or isinstance(obj, Literal) \
-                    or not obj \
-                    or (self.STRONGLY_TYPED_TERMS and isinstance(obj, REGEXTerm)):
+            if (not self.STRONGLY_TYPED_TERMS
+                    or isinstance(obj, Literal)
+                    or not obj
+                    or (self.STRONGLY_TYPED_TERMS and isinstance(obj, REGEXTerm))):
                 literal = expression.alias(literal_table, "literal")
                 clause = self.build_clause(literal, subject, predicate, obj, context)
                 selects.append((literal, clause, ASSERTED_LITERAL_PARTITION))
@@ -471,6 +477,9 @@ class membership columns:
             clause = self.build_clause(quoted, subject, predicate, obj, context)
             selects.append((quoted, clause, QUOTED_PARTITION))
 
+        return selects
+
+    def _do_triples_select(self, selects, context):
         q = union_select(selects, select_type=TRIPLE_SELECT_NO_ORDER)
         with self.engine.connect() as connection:
             res = connection.execute(q)
@@ -490,6 +499,29 @@ class membership columns:
         for (s, p, o), contexts in tripleCoverage.items():
             yield (s, p, o), (c for c in contexts)
 
+    def triples(self, triple, context=None):
+        """
+        A generator over all the triples matching pattern.
+
+        Pattern can be any objects for comparing against nodes in
+        the store, for example, RegExLiteral, Date? DateRange?
+
+        quoted table:                <id>_quoted_statements
+        asserted rdf:type table:     <id>_type_statements
+        asserted non rdf:type table: <id>_asserted_statements
+
+        triple columns:
+            subject, predicate, object, context, termComb, objLanguage, objDatatype
+        class membership columns:
+            member, klass, context, termComb
+
+        FIXME:  These union all selects *may* be further optimized by joins
+
+        """
+        selects = self._triples_helper(triple, context)
+        for m in self._do_triples_select(selects, context):
+            yield m
+
     def triples_choices(self, triple, context=None):
         """
         A variant of triples.
@@ -499,36 +531,40 @@ def triples_choices(self, triple, context=None):
         import default 'fallback' implementation, which will iterate over
         each term in the list and dispatch to triples.
         """
+        # We already support accepting a list for s/p/o
         subject, predicate, object_ = triple
-
+        selects = []
         if isinstance(object_, list):
             assert not isinstance(
                 subject, list), "object_ / subject are both lists"
             assert not isinstance(
                 predicate, list), "object_ / predicate are both lists"
             if not object_:
                 object_ = None
-            for (s1, p1, o1), cg in self.triples(
-                    (subject, predicate, object_), context):
-                yield (s1, p1, o1), cg
+            for o in grouper(object_, self.max_terms_per_where):
+                for sels in self._triples_helper((subject, predicate, o), context):
+                    selects.append(sels)
 
         elif isinstance(subject, list):
             assert not isinstance(
                 predicate, list), "subject / predicate are both lists"
             if not subject:
                 subject = None
-            for (s1, p1, o1), cg in self.triples(
-                    (subject, predicate, object_), context):
-                yield (s1, p1, o1), cg
+            for s in grouper(subject, self.max_terms_per_where):
+                for sels in self._triples_helper((s, predicate, object_), context):
+                    selects.append(sels)
 
         elif isinstance(predicate, list):
             assert not isinstance(
                 subject, list), "predicate / subject are both lists"
             if not predicate:
                 predicate = None
-            for (s1, p1, o1), cg in self.triples(
-                    (subject, predicate, object_), context):
-                yield (s1, p1, o1), cg
+            for p in grouper(predicate, self.max_terms_per_where):
+                for sels in self._triples_helper((subject, p, object_), context):
+                    selects.append(sels)
+
+        for m in self._do_triples_select(selects, context):
+            yield m
 
     def contexts(self, triple=None):
         quoted_table = self.tables["quoted_statements"]
@@ -759,9 +795,9 @@ def _get_build_command(self, triple, context=None, quoted=False):
             command_type = "type"
         return command_type, statement, params
 
-    def _remove_context(self, identifier):
+    def _remove_context(self, context):
         """Remove context."""
-        assert identifier
+        assert context
         quoted_table = self.tables["quoted_statements"]
         asserted_table = self.tables["asserted_statements"]
         asserted_type_table = self.tables["type_statements"]
@@ -772,7 +808,7 @@ def _remove_context(self, identifier):
             try:
                 for table in [quoted_table, asserted_table,
                               asserted_type_table, literal_table]:
-                    clause = self.build_context_clause(identifier, table)
+                    clause = self.build_context_clause(context, table)
                     connection.execute(table.delete(clause))
                 trans.commit()
             except Exception:
diff --git a/test/graph_case.py b/test/graph_case.py
@@ -1,10 +1,7 @@
 # -*- coding: utf-8 -*-
 import unittest
 
-from rdflib import Graph
-from rdflib import URIRef
-from rdflib import Literal
-from rdflib import plugin
+from rdflib import Graph, URIRef, Literal, plugin
 from rdflib.parser import StringInputSource
 from rdflib.py3compat import PY3
 from rdflib.store import Store
@@ -318,6 +315,25 @@ def testBindNamespace(self):
             "Unknown prefixes for namespace should be transformed to nsX"
         )
 
+    def testTriplesChoices(self):
+        likes = self.likes
+        pizza = self.pizza
+        cheese = self.cheese
+        tarek = self.tarek
+        michel = self.michel
+        bob = self.bob
+        self.addStuff()
+        trips = self.graph.triples_choices((None, likes, [pizza, cheese]))
+        self.assertEqual(
+            set(trips),
+            set([(tarek, likes, pizza),
+                 (tarek, likes, pizza),
+                 (tarek, likes, cheese),
+                 (michel, likes, pizza),
+                 (michel, likes, cheese),
+                 (bob, likes, cheese)])
+        )
+
 
 xmltestdoc = """<?xml version="1.0" encoding="UTF-8"?>
 <rdf:RDF
diff --git a/test/test_sqlalchemy.py b/test/test_sqlalchemy.py
@@ -1,9 +1,9 @@
 import unittest
 
 try:
-    from unittest.mock import patch
+    from unittest.mock import patch, MagicMock
 except ImportError:
-    from mock import patch
+    from mock import patch, MagicMock
 
 import six
 
@@ -16,6 +16,7 @@
 from rdflib.store import Store
 
 from rdflib_sqlalchemy import registerplugins
+from sqlalchemy.sql.selectable import Select
 
 
 michel = URIRef(u"michel")
@@ -57,7 +58,7 @@ class SQLATestCase(unittest.TestCase):
 
     def setUp(self):
         self.store = plugin.get(
-            "SQLAlchemy", Store)(identifier=self.identifier)
+            "SQLAlchemy", Store)(identifier=self.identifier, configuration=self.dburi)
         self.graph = ConjunctiveGraph(self.store, identifier=self.identifier)
         self.graph.open(self.dburi, create=True)
 
@@ -83,6 +84,13 @@ def test_namespaces(self):
     def test_contexts_without_triple(self):
         self.assertEqual(list(self.graph.contexts()), [])
 
+    def test_contexts_result(self):
+        ctx_id = URIRef('http://example.org/context')
+        g = self.graph.get_context(ctx_id)
+        g.add((michel, likes, pizza))
+        actual = list(self.store.contexts())
+        self.assertEqual(actual[0], ctx_id)
+
     def test_contexts_with_triple(self):
         statemnt = (michel, likes, pizza)
         self.assertEqual(list(self.graph.contexts(triple=statemnt)), [])
@@ -91,7 +99,29 @@ def test__len(self):
         self.assertEqual(self.store.__len__(), 0)
 
     def test__remove_context(self):
-        self.store._remove_context(self.identifier)
+        ctx_id = URIRef('http://example.org/context')
+        g = self.graph.get_context(ctx_id)
+        g.add((michel, likes, pizza))
+        self.store._remove_context(g)
+        self.assertEqual(list(self.store.contexts()), [])
+
+    def test_triples_choices(self):
+        # Create a mock for the sqlalchemy engine so we can capture the arguments
+        p = MagicMock(name='engine')
+        self.store.engine = p
+
+        # Set this so we're not including selects for both asserted and literal tables for
+        # a choice
+        self.store.STRONGLY_TYPED_TERMS = True
+        # Set the grouping of terms
+        self.store.max_terms_per_where = 2
+        # force execution of the generator
+        for x in self.store.triples_choices((None, likes, [michel, pizza, likes])):
+            pass
+        args = p.connect().__enter__().execute.call_args[0]
+        children = args[0].get_children(column_collections=False)
+        # Expect two selects: one for the first two choices plus one for the last one
+        self.assertEqual(sum(1 for c in children if isinstance(c, Select)), 2)
 
 
 if __name__ == "__main__":