Skip to content

Commit bc0ed54

Browse files
authored
Support QueryCondition on Dense Arrays (#1198)
1 parent f7145ed commit bc0ed54

File tree

6 files changed

+428
-67
lines changed

6 files changed

+428
-67
lines changed

HISTORY.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
* `setup.py` retrieves core version by using `ctypes` to call `tiledb_version` rather than parsing `tiledb_version.h` [#1191](https://github.com/TileDB-Inc/TileDB-Py/pull/1191)
55

66
## API Changes
7-
* Querying dense array with `[:]` returns shape that matches nonempty domain, consistent with `.df[:]` and .multi_index[:]` [#1199](https://github.com/TileDB-Inc/TileDB-Py/pull/1199)
7+
* Support `QueryCondition` for dense arrays [#1198](https://github.com/TileDB-Inc/TileDB-Py/pull/1198)
8+
* Querying dense array with `[:]` returns shape that matches nonempty domain, consistent with `.df[:]` and `.multi_index[:]` [#1199](https://github.com/TileDB-Inc/TileDB-Py/pull/1199)
89

910
# TileDB-Py 0.16.1 Release Notes
1011

@@ -17,9 +18,9 @@
1718
* TileDB-Py 0.16.0 includes TileDB Embedded [TileDB 2.10.0](https://github.com/TileDB-Inc/TileDB/releases/tag/2.10.0)
1819

1920
## API Changes
20-
* Addition of Filestore API [#1070](https://github.com/TileDB-Inc/TileDB-Py/pull/1070)
21+
* Addition of `Filestore` API [#1070](https://github.com/TileDB-Inc/TileDB-Py/pull/1070)
2122
* Use `bool` instead of `uint8` for Boolean dtype in `dataframe_.py` [#1154](https://github.com/TileDB-Inc/TileDB-Py/pull/1154)
22-
* Support QueryCondition OR operator [#1146](https://github.com/TileDB-Inc/TileDB-Py/pull/1146)
23+
* Support `QueryCondition` OR operator [#1146](https://github.com/TileDB-Inc/TileDB-Py/pull/1146)
2324

2425
# TileDB-Py 0.15.6 Release Notes
2526

examples/query_condition_dense.py

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
# query_condition_dense.py
2+
#
3+
# LICENSE
4+
#
5+
# The MIT License
6+
#
7+
# Copyright (c) 2021 TileDB, Inc.
8+
#
9+
# Permission is hereby granted, free of charge, to any person obtaining a copy
10+
# of this software and associated documentation files (the "Software"), to deal
11+
# in the Software without restriction, including without limitation the rights
12+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13+
# copies of the Software, and to permit persons to whom the Software is
14+
# furnished to do so, subject to the following conditions:
15+
#
16+
# The above copyright notice and this permission notice shall be included in
17+
# all copies or substantial portions of the Software.
18+
#
19+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25+
# THE SOFTWARE.
26+
#
27+
28+
# This example creates a dense array with two numeric attributes,
29+
# writes sample data to the array, and then prints out a filtered
30+
# result using the TileDB QueryCondition feature.
31+
32+
import tiledb
33+
import numpy as np
34+
from pprint import pprint
35+
import tempfile
36+
import string
37+
38+
uri = "query_condition_dense"
39+
40+
41+
def create_array(path):
42+
# create a dense array
43+
dom = tiledb.Domain(
44+
tiledb.Dim(name="coords", domain=(1, 10), tile=1, dtype=np.uint32)
45+
)
46+
attrs = [
47+
tiledb.Attr(name="attr1", dtype=np.uint64),
48+
tiledb.Attr(name="attr2", dtype=np.float64),
49+
]
50+
schema = tiledb.ArraySchema(domain=dom, attrs=attrs, sparse=False)
51+
tiledb.Array.create(path, schema, overwrite=True)
52+
53+
# fill array with randomized values
54+
with tiledb.open(path, "w") as arr:
55+
rand = np.random.default_rng()
56+
arr[:] = {
57+
"attr1": rand.integers(low=0, high=10, size=10),
58+
"attr2": rand.random(size=10),
59+
}
60+
61+
62+
def read_array(path):
63+
with tiledb.open(uri) as arr:
64+
print("--- without query condition:")
65+
print()
66+
pprint(arr[:])
67+
print()
68+
69+
with tiledb.open(uri) as arr:
70+
qc = tiledb.QueryCondition("(2 < attr1 < 6) and (attr2 < 0.5 or attr2 > 0.85)")
71+
print(f"--- with query condition {qc}:")
72+
73+
print(f"--- the fill value for attr1 is {arr.attr('attr1').fill}")
74+
print(f"--- the fill value for attr1 is {arr.attr('attr2').fill}")
75+
76+
print()
77+
res = arr.query(attr_cond=qc)[:]
78+
pprint(res)
79+
80+
81+
if __name__ == "__main__":
82+
"""Example output for `python query_condition_dense.py`:
83+
84+
--- without query condition:
85+
86+
OrderedDict([('attr1', array([4, 0, 9, 7, 6, 0, 0, 5, 7, 5], dtype=uint64)),
87+
('attr2',
88+
array([0.74476144, 0.47211544, 0.99054245, 0.36640416, 0.91699594,
89+
0.06216043, 0.58581863, 0.00505695, 0.7486192 , 0.87649422]))])
90+
91+
--- with query condition QueryCondition(expression='(2 < attr1 < 6) and (attr2 < 0.5 or attr2 > 0.85)'):
92+
--- the fill value for attr1 is [18446744073709551615]
93+
--- the fill value for attr1 is [nan]
94+
95+
OrderedDict([('attr1',
96+
array([18446744073709551615, 18446744073709551615, 18446744073709551615,
97+
18446744073709551615, 18446744073709551615, 18446744073709551615,
98+
18446744073709551615, 5, 18446744073709551615,
99+
5], dtype=uint64)),
100+
('attr2',
101+
array([ nan, nan, nan, nan, nan,
102+
nan, nan, 0.00505695, nan, 0.87649422]))])
103+
"""
104+
create_array(uri)
105+
read_array(uri)

examples/query_condition_sparse.py

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
# query_condition_sparse.py
2+
#
3+
# LICENSE
4+
#
5+
# The MIT License
6+
#
7+
# Copyright (c) 2021 TileDB, Inc.
8+
#
9+
# Permission is hereby granted, free of charge, to any person obtaining a copy
10+
# of this software and associated documentation files (the "Software"), to deal
11+
# in the Software without restriction, including without limitation the rights
12+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13+
# copies of the Software, and to permit persons to whom the Software is
14+
# furnished to do so, subject to the following conditions:
15+
#
16+
# The above copyright notice and this permission notice shall be included in
17+
# all copies or substantial portions of the Software.
18+
#
19+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25+
# THE SOFTWARE.
26+
#
27+
28+
# This example creates a sparse array with two numeric attributes,
29+
# writes sample data to the array, and then prints out a filtered
30+
# result using the TileDB QueryCondition feature.
31+
32+
import tiledb
33+
import numpy as np
34+
from pprint import pprint
35+
import tempfile
36+
import string
37+
38+
uri = "query_condition_sparse"
39+
40+
41+
def create_array(path):
42+
# create a sparse array
43+
dom = tiledb.Domain(
44+
tiledb.Dim(name="coords", domain=(1, 10), tile=1, dtype=np.uint32)
45+
)
46+
attrs = [
47+
tiledb.Attr(name="attr1", dtype=np.uint64),
48+
tiledb.Attr(name="attr2", dtype=np.float64),
49+
]
50+
schema = tiledb.ArraySchema(domain=dom, attrs=attrs, sparse=True)
51+
tiledb.Array.create(path, schema, overwrite=True)
52+
53+
# fill array with randomized values
54+
with tiledb.open(path, "w") as arr:
55+
rand = np.random.default_rng()
56+
arr[np.arange(1, 11)] = {
57+
"attr1": rand.integers(low=0, high=10, size=10),
58+
"attr2": rand.random(size=10),
59+
}
60+
61+
62+
def read_array(path):
63+
with tiledb.open(uri) as arr:
64+
print("--- without query condition:")
65+
print()
66+
pprint(arr[:])
67+
print()
68+
69+
with tiledb.open(uri) as arr:
70+
qc = tiledb.QueryCondition("(2 < attr1 < 6) and (attr2 < 0.5 or attr2 > 0.85)")
71+
print(f"--- with query condition {qc}:")
72+
print()
73+
res = arr.query(attr_cond=qc)[:]
74+
pprint(res)
75+
76+
77+
if __name__ == "__main__":
78+
"""Example output for `python query_condition_sparse.py`:
79+
80+
--- without query condition:
81+
82+
OrderedDict([('attr1', array([2, 4, 4, 3, 4, 7, 5, 2, 2, 8], dtype=uint64)),
83+
('attr2',
84+
array([0.62445071, 0.32415481, 0.39117764, 0.66609931, 0.48122102,
85+
0.93561984, 0.70998524, 0.10322076, 0.28343041, 0.33623958])),
86+
('coords',
87+
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=uint32))])
88+
89+
--- with query condition QueryCondition(expression='(2 < attr1 < 6) and (attr2 < 0.5 or attr2 > 0.85)'):
90+
91+
OrderedDict([('attr1', array([4, 4, 4], dtype=uint64)),
92+
('attr2', array([0.32415481, 0.39117764, 0.48122102])),
93+
('coords', array([2, 3, 5], dtype=uint32))])
94+
"""
95+
create_array(uri)
96+
read_array(uri)

tiledb/libtiledb.pyx

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ _tiledb_dtype_to_numpy_typeid_convert ={
109109
TILEDB_INT16: np.NPY_INT16,
110110
TILEDB_UINT16: np.NPY_UINT16,
111111
TILEDB_CHAR: np.NPY_STRING,
112+
TILEDB_STRING_ASCII: np.NPY_STRING,
112113
TILEDB_STRING_UTF8: np.NPY_UNICODE,
113114
}
114115
IF LIBTILEDB_VERSION_MAJOR >= 2:
@@ -131,7 +132,7 @@ _tiledb_dtype_to_numpy_dtype_convert = {
131132
TILEDB_INT16: np.int16,
132133
TILEDB_UINT16: np.uint16,
133134
TILEDB_CHAR: np.dtype('S1'),
134-
TILEDB_STRING_ASCII: np.bytes_,
135+
TILEDB_STRING_ASCII: np.dtype('S'),
135136
TILEDB_STRING_UTF8: np.dtype('U1'),
136137
}
137138
IF LIBTILEDB_VERSION_MAJOR >= 2:
@@ -4102,8 +4103,6 @@ cdef class Query(object):
41024103
raise TileDBError(f"Selected attribute does not exist: '{name}'")
41034104
self.attrs = attrs
41044105
self.attr_cond = attr_cond
4105-
if attr_cond is not None and not array.schema.sparse:
4106-
raise TileDBError("QueryConditions may only be applied to sparse arrays")
41074106

41084107
if order == None:
41094108
if array.schema.sparse:

tiledb/query_condition.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,18 @@
2121
class QueryCondition:
2222
"""
2323
Class representing a TileDB query condition object for attribute filtering
24-
pushdown. Set the query condition with a string representing an expression
24+
pushdown.
25+
26+
When querying a sparse array, only the values that satisfy the given
27+
condition are returned (coupled with their associated coordinates). An example
28+
may be found in `examples/query_condition_sparse.py`.
29+
30+
For dense arrays, the given shape of the query matches the shape of the output
31+
array. Values that DO NOT satisfy the given condition are filled with the
32+
TileDB default fill value. Different attribute types have different default
33+
fill values as outlined here (https://docs.tiledb.com/main/background/internal-mechanics/writing#default-fill-values). An example may be found in `examples/query_condition_dense.py`.
34+
35+
Set the query condition with a string representing an expression
2536
as defined by the grammar below. A more straightforward example of usage is
2637
given beneath.
2738

0 commit comments

Comments
 (0)