Skip to content

Commit 84b8254

Browse files
committed
Using groups of choices in triples_choices (#43)
- SQLite only allows a certain number of terms in a WHERE clause. This change allows calls to addN to always stay under that limit. - Also, renaming the _remove_context param to match actual usage. - Improving the test for _remove_context(), and adding tests for triples_choices() and contexts().
1 parent f3a33f7 commit 84b8254

File tree

3 files changed

+128
-46
lines changed

3 files changed

+128
-46
lines changed

rdflib_sqlalchemy/store.py

Lines changed: 74 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,25 @@
4848
Any = None
4949

5050

51+
def grouper(iterable, n):
    """Collect data into chunks of at most n elements.

    Yields successive lists holding up to ``n`` items taken, in order, from
    ``iterable``. Only the final chunk may be shorter than ``n``; an empty
    iterable yields nothing.

    Args:
        iterable: any iterable; it is consumed lazily, one chunk at a time.
        n (int): maximum number of elements per chunk; must be positive.

    Raises:
        AssertionError: if ``n`` is not greater than zero. Because this is a
            generator, the check runs when iteration starts, not at call time.
    """
    assert n > 0, 'Cannot group into chunks of zero elements'
    chunk = []
    # A plain ``for`` loop handles exhaustion of the iterable for us, so no
    # explicit next()/StopIteration bookkeeping is needed.
    for item in iterable:
        chunk.append(item)
        if len(chunk) == n:
            yield chunk
            # Start a fresh list so chunks already handed out are not mutated.
            chunk = []

    # Emit the trailing, partially filled chunk (if any).
    if chunk:
        yield chunk
68+
69+
5170
def generate_interned_id(identifier):
5271
return "{prefix}{identifier_hash}".format(
5372
prefix=INTERNED_PREFIX,
@@ -78,7 +97,8 @@ class SQLAlchemy(Store, SQLGeneratorMixin, StatisticsMixin):
7897
regex_matching = PYTHON_REGEX
7998
configuration = Literal("sqlite://")
8099

81-
def __init__(self, identifier=None, configuration=None, engine=None):
100+
def __init__(self, identifier=None, configuration=None, engine=None,
101+
max_terms_per_where=800):
82102
"""
83103
Initialisation.
84104
@@ -89,10 +109,14 @@ def __init__(self, identifier=None, configuration=None, engine=None):
89109
with the additional "url" key pointing to the connection URL. See `open` documentation
90110
for more details.
91111
engine (sqlalchemy.engine.Engine, optional): a pre-existing `SQLAlchemy.engine.Engine` instance.
92-
112+
max_terms_per_where (int): The max number of terms (s/p/o) in a call to
113+
triples_choices to combine in one SQL "where" clause. Important for SQLite
114+
back-end with SQLITE_MAX_EXPR_DEPTH limit and SQLITE_LIMIT_COMPOUND_SELECT
115+
-- must find a balance that doesn't hit either of those.
93116
"""
94117
self.identifier = identifier and identifier or "hardcoded"
95118
self.engine = engine
119+
self.max_terms_per_where = max_terms_per_where
96120

97121
# Use only the first 10 bytes of the digest
98122
self._interned_id = generate_interned_id(self.identifier)
@@ -383,25 +407,7 @@ def remove(self, triple, context):
383407
_logger.exception("Removal failed.")
384408
trans.rollback()
385409

386-
def triples(self, triple, context=None):
387-
"""
388-
A generator over all the triples matching pattern.
389-
390-
Pattern can be any objects for comparing against nodes in
391-
the store, for example, RegExLiteral, Date? DateRange?
392-
393-
quoted table: <id>_quoted_statements
394-
asserted rdf:type table: <id>_type_statements
395-
asserted non rdf:type table: <id>_asserted_statements
396-
397-
triple columns:
398-
subject, predicate, object, context, termComb, objLanguage, objDatatype
399-
class membership columns:
400-
member, klass, context, termComb
401-
402-
FIXME: These union all selects *may* be further optimized by joins
403-
404-
"""
410+
def _triples_helper(self, triple, context=None):
405411
subject, predicate, obj = triple
406412

407413
quoted_table = self.tables["quoted_statements"]
@@ -427,10 +433,10 @@ class membership columns:
427433
# Literal partition if (obj is Literal or None) and asserted
428434
# non rdf:type partition (if obj is URIRef or None)
429435
selects = []
430-
if not self.STRONGLY_TYPED_TERMS \
431-
or isinstance(obj, Literal) \
432-
or not obj \
433-
or (self.STRONGLY_TYPED_TERMS and isinstance(obj, REGEXTerm)):
436+
if (not self.STRONGLY_TYPED_TERMS
437+
or isinstance(obj, Literal)
438+
or not obj
439+
or (self.STRONGLY_TYPED_TERMS and isinstance(obj, REGEXTerm))):
434440
literal = expression.alias(literal_table, "literal")
435441
clause = self.build_clause(literal, subject, predicate, obj, context)
436442
selects.append((literal, clause, ASSERTED_LITERAL_PARTITION))
@@ -471,6 +477,9 @@ class membership columns:
471477
clause = self.build_clause(quoted, subject, predicate, obj, context)
472478
selects.append((quoted, clause, QUOTED_PARTITION))
473479

480+
return selects
481+
482+
def _do_triples_select(self, selects, context):
474483
q = union_select(selects, select_type=TRIPLE_SELECT_NO_ORDER)
475484
with self.engine.connect() as connection:
476485
res = connection.execute(q)
@@ -490,6 +499,29 @@ class membership columns:
490499
for (s, p, o), contexts in tripleCoverage.items():
491500
yield (s, p, o), (c for c in contexts)
492501

502+
def triples(self, triple, context=None):
    """
    Generate all the triples that match the given pattern.

    The pattern may hold any objects that can be compared against nodes
    in the store, for example, RegExLiteral, Date? DateRange?

    Tables consulted:
        quoted table:                 <id>_quoted_statements
        asserted rdf:type table:      <id>_type_statements
        asserted non rdf:type table:  <id>_asserted_statements

    Triple columns:
        subject, predicate, object, context, termComb, objLanguage, objDatatype
    Class membership columns:
        member, klass, context, termComb

    Each yielded item is whatever ``_do_triples_select`` produces for the
    selects built by ``_triples_helper``.

    FIXME: These union all selects *may* be further optimized by joins
    """
    # Build the per-partition selects first, then run them as one query.
    partition_selects = self._triples_helper(triple, context)
    matches = self._do_triples_select(partition_selects, context)
    for match in matches:
        yield match
524+
493525
def triples_choices(self, triple, context=None):
    """
    A variant of triples in which exactly one of subject, predicate or
    object may be a list of alternative terms.

    The list of choices is split into groups of at most
    ``self.max_terms_per_where`` terms. One set of selects is built per
    group via ``_triples_helper`` and all of them are executed together
    through ``_do_triples_select``, so no single WHERE clause has to carry
    more than that many choice terms.

    An empty list of choices is treated as a wildcard (``None``) for that
    position. If none of the three terms is a list, nothing is yielded.

    Raises:
        AssertionError: if more than one of the terms is a list.
    """
    subject, predicate, object_ = triple

    def choice_groups(choices):
        # An empty list means "match anything" in that position. It must not
        # be handed to grouper() as None, which cannot be iterated.
        if not choices:
            return [None]
        # The underlying helpers already accept a list for s/p/o, so each
        # group is passed on as a (shorter) list.
        return grouper(choices, self.max_terms_per_where)

    if isinstance(object_, list):
        assert not isinstance(
            subject, list), "object_ / subject are both lists"
        assert not isinstance(
            predicate, list), "object_ / predicate are both lists"
        patterns = [(subject, predicate, group)
                    for group in choice_groups(object_)]

    elif isinstance(subject, list):
        assert not isinstance(
            predicate, list), "subject / predicate are both lists"
        patterns = [(group, predicate, object_)
                    for group in choice_groups(subject)]

    elif isinstance(predicate, list):
        # subject cannot be a list here: that case is handled by the branch
        # above, so no further assertion is needed.
        patterns = [(subject, group, object_)
                    for group in choice_groups(predicate)]

    else:
        # No list of choices was supplied at all.
        patterns = []

    selects = []
    for pattern in patterns:
        selects.extend(self._triples_helper(pattern, context))

    # NOTE(review): assumes running a union over zero selects is not
    # meaningful; skip the query entirely instead of sending an empty one.
    if not selects:
        return

    for m in self._do_triples_select(selects, context):
        yield m
532568

533569
def contexts(self, triple=None):
534570
quoted_table = self.tables["quoted_statements"]
@@ -759,9 +795,9 @@ def _get_build_command(self, triple, context=None, quoted=False):
759795
command_type = "type"
760796
return command_type, statement, params
761797

762-
def _remove_context(self, identifier):
798+
def _remove_context(self, context):
763799
"""Remove context."""
764-
assert identifier
800+
assert context
765801
quoted_table = self.tables["quoted_statements"]
766802
asserted_table = self.tables["asserted_statements"]
767803
asserted_type_table = self.tables["type_statements"]
@@ -772,7 +808,7 @@ def _remove_context(self, identifier):
772808
try:
773809
for table in [quoted_table, asserted_table,
774810
asserted_type_table, literal_table]:
775-
clause = self.build_context_clause(identifier, table)
811+
clause = self.build_context_clause(context, table)
776812
connection.execute(table.delete(clause))
777813
trans.commit()
778814
except Exception:

test/graph_case.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,7 @@
11
# -*- coding: utf-8 -*-
22
import unittest
33

4-
from rdflib import Graph
5-
from rdflib import URIRef
6-
from rdflib import Literal
7-
from rdflib import plugin
4+
from rdflib import Graph, URIRef, Literal, plugin
85
from rdflib.parser import StringInputSource
96
from rdflib.py3compat import PY3
107
from rdflib.store import Store
@@ -318,6 +315,25 @@ def testBindNamespace(self):
318315
"Unknown prefixes for namespace should be transformed to nsX"
319316
)
320317

318+
def testTriplesChoices(self):
    """triples_choices with a list of objects returns the union of matches.

    After ``addStuff()`` populates the graph, asking for everything that
    anyone ``likes`` among ``[pizza, cheese]`` must return exactly the
    triples listed below.
    """
    likes = self.likes
    pizza = self.pizza
    cheese = self.cheese
    tarek = self.tarek
    michel = self.michel
    bob = self.bob
    self.addStuff()
    trips = self.graph.triples_choices((None, likes, [pizza, cheese]))
    # Each expected triple is listed once; the comparison is set-based, so
    # ordering of the results does not matter.
    self.assertEqual(
        set(trips),
        set([(tarek, likes, pizza),
             (tarek, likes, cheese),
             (michel, likes, pizza),
             (michel, likes, cheese),
             (bob, likes, cheese)])
    )
336+
321337

322338
xmltestdoc = """<?xml version="1.0" encoding="UTF-8"?>
323339
<rdf:RDF

test/test_sqlalchemy.py

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
import unittest
22

33
try:
4-
from unittest.mock import patch
4+
from unittest.mock import patch, MagicMock
55
except ImportError:
6-
from mock import patch
6+
from mock import patch, MagicMock
77

88
import six
99

@@ -16,6 +16,7 @@
1616
from rdflib.store import Store
1717

1818
from rdflib_sqlalchemy import registerplugins
19+
from sqlalchemy.sql.selectable import Select
1920

2021

2122
michel = URIRef(u"michel")
@@ -57,7 +58,7 @@ class SQLATestCase(unittest.TestCase):
5758

5859
def setUp(self):
5960
self.store = plugin.get(
60-
"SQLAlchemy", Store)(identifier=self.identifier)
61+
"SQLAlchemy", Store)(identifier=self.identifier, configuration=self.dburi)
6162
self.graph = ConjunctiveGraph(self.store, identifier=self.identifier)
6263
self.graph.open(self.dburi, create=True)
6364

@@ -83,6 +84,13 @@ def test_namespaces(self):
8384
def test_contexts_without_triple(self):
8485
self.assertEqual(list(self.graph.contexts()), [])
8586

87+
def test_contexts_result(self):
    """The first context reported by the store equals the one that was populated."""
    context_id = URIRef('http://example.org/context')
    context_graph = self.graph.get_context(context_id)
    context_graph.add((michel, likes, pizza))
    reported = list(self.store.contexts())
    self.assertEqual(reported[0], context_id)
93+
8694
def test_contexts_with_triple(self):
8795
statemnt = (michel, likes, pizza)
8896
self.assertEqual(list(self.graph.contexts(triple=statemnt)), [])
@@ -91,7 +99,29 @@ def test__len(self):
9199
self.assertEqual(self.store.__len__(), 0)
92100

93101
def test__remove_context(self):
    """Removing a populated context leaves the store with no contexts."""
    context_id = URIRef('http://example.org/context')
    context_graph = self.graph.get_context(context_id)
    context_graph.add((michel, likes, pizza))
    # _remove_context is given the context graph itself.
    self.store._remove_context(context_graph)
    remaining = list(self.store.contexts())
    self.assertEqual(remaining, [])
107+
108+
def test_triples_choices(self):
    """Choices are split into groups of ``max_terms_per_where`` terms.

    With a group size of 2 and three choices, the query handed to the
    engine is expected to contain two selects: one for the first two
    choices and one for the remaining choice.
    """
    # Swap in a mock engine so the executed query can be captured
    engine = MagicMock(name='engine')
    self.store.engine = engine

    # Avoid getting selects for both the asserted and literal tables for
    # a single choice
    self.store.STRONGLY_TYPED_TERMS = True
    # Two terms per group
    self.store.max_terms_per_where = 2

    choices = [michel, pizza, likes]
    # Drain the generator so the query is actually executed
    for _ in self.store.triples_choices((None, likes, choices)):
        pass

    call_args = engine.connect().__enter__().execute.call_args[0]
    query = call_args[0]
    select_count = 0
    for child in query.get_children(column_collections=False):
        if isinstance(child, Select):
            select_count += 1
    # Expect two selects: one for the first two choices plus one for the last one
    self.assertEqual(select_count, 2)
95125

96126

97127
if __name__ == "__main__":

0 commit comments

Comments
 (0)