Skip to content

Commit 1b63921

Browse files
authored
Correct Nullable String Attributes and All NULL Data (#1848)
* `np.nan_to_num` cannot be used on string arrays
* Only convert `attr_val` if nullable
* Support All Null Data
1 parent be6cf1b commit 1b63921

File tree

2 files changed

+92
-23
lines changed

2 files changed

+92
-23
lines changed

tiledb/libtiledb.pyx

Lines changed: 53 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -152,7 +152,16 @@ cdef _write_array(
152152

153153
if attr.isvar:
154154
try:
155-
buffer, offsets = array_to_buffer(values[i], True, False)
155+
if attr.isnullable:
156+
if(np.issubdtype(attr.dtype, np.unicode_)
157+
or np.issubdtype(attr.dtype, np.string_)
158+
or np.issubdtype(attr.dtype, np.bytes_)):
159+
attr_val = np.array(["" if v is None else v for v in values[i]])
160+
else:
161+
attr_val = np.nan_to_num(values[i])
162+
else:
163+
attr_val = values[i]
164+
buffer, offsets = array_to_buffer(attr_val, True, False)
156165
except Exception as exc:
157166
raise type(exc)(f"Failed to convert buffer for attribute: '{attr.name}'") from exc
158167
buffer_offsets_sizes[i] = offsets.nbytes
@@ -2241,6 +2250,7 @@ cdef class DenseArrayImpl(Array):
22412250
selection_tuple = (selection,) if not isinstance(selection, tuple) else selection
22422251
self._setitem_impl(selection, val, dict())
22432252

2253+
22442254
def _setitem_impl(self, object selection, object val, dict nullmaps):
22452255
"""Implementation for setitem with optional support for validity bitmaps."""
22462256
from .subarray import Subarray
@@ -2300,16 +2310,33 @@ cdef class DenseArrayImpl(Array):
23002310
# ensure that the value is array-convertible, for example: pandas.Series
23012311
attr_val = np.asarray(attr_val)
23022312
if attr.isnullable and name not in nullmaps:
2303-
nullmaps[name] = np.array([int(v is not None) for v in attr_val], dtype=np.uint8)
2313+
nullmaps[name] = np.array(
2314+
[int(v is not None) for v in attr_val],
2315+
dtype=np.uint8
2316+
)
23042317
else:
23052318
if (np.issubdtype(attr.dtype, np.string_) and not
23062319
(np.issubdtype(attr_val.dtype, np.string_) or attr_val.dtype == np.dtype('O'))):
23072320
raise ValueError("Cannot write a string value to non-string "
23082321
"typed attribute '{}'!".format(name))
23092322

23102323
if attr.isnullable and name not in nullmaps:
2311-
nullmaps[name] = ~np.ma.masked_invalid(attr_val).mask
2312-
attr_val = np.nan_to_num(attr_val)
2324+
try:
2325+
nullmaps[name] = ~np.ma.masked_invalid(attr_val).mask
2326+
except Exception as exc:
2327+
attr_val = np.asarray(attr_val)
2328+
nullmaps[name] = np.array(
2329+
[int(v is not None) for v in attr_val],
2330+
dtype=np.uint8
2331+
)
2332+
2333+
if np.issubdtype(attr.dtype, np.string_):
2334+
attr_val = np.array(
2335+
["" if v is None else v for v in attr_val])
2336+
else:
2337+
attr_val = np.nan_to_num(attr_val)
2338+
attr_val = np.array(
2339+
[0 if v is None else v for v in attr_val])
23132340
attr_val = np.ascontiguousarray(attr_val, dtype=attr.dtype)
23142341
except Exception as exc:
23152342
raise ValueError(f"NumPy array conversion check failed for attr '{name}'") from exc
@@ -2335,7 +2362,7 @@ cdef class DenseArrayImpl(Array):
23352362
# ensure that the value is array-convertible, for example: pandas.Series
23362363
val = np.asarray(val)
23372364
if attr.isnullable and name not in nullmaps:
2338-
nullmaps[name] = np.array([int(v is not None) for v in val], dtype=np.uint8)
2365+
nullmaps[name] = np.array([int(v is None) for v in val], dtype=np.uint8)
23392366
else:
23402367
if (np.issubdtype(attr.dtype, np.string_) and not
23412368
(np.issubdtype(val.dtype, np.string_) or val.dtype == np.dtype('O'))):
@@ -2817,22 +2844,34 @@ def _setitem_impl_sparse(self: Array, selection, val, dict nullmaps):
28172844
attr_val = val[name]
28182845

28192846
try:
2847+
# ensure that the value is array-convertible, for example: pandas.Series
2848+
attr_val = np.asarray(attr_val)
2849+
28202850
if attr.isvar:
2821-
# ensure that the value is array-convertible, for example: pandas.Series
2822-
attr_val = np.asarray(attr_val)
28232851
if attr.isnullable and name not in nullmaps:
2824-
nullmaps[name] = np.array([int(v is not None) for v in attr_val], dtype=np.uint8)
2852+
nullmaps[name] = np.array(
2853+
[int(v is not None) for v in attr_val], dtype=np.uint8)
28252854
else:
2826-
if (np.issubdtype(attr.dtype, np.string_) and not
2827-
(np.issubdtype(attr_val.dtype, np.string_) or attr_val.dtype == np.dtype('O'))):
2855+
if (np.issubdtype(attr.dtype, np.string_)
2856+
and not (np.issubdtype(attr_val.dtype, np.string_)
2857+
or attr_val.dtype == np.dtype('O'))):
28282858
raise ValueError("Cannot write a string value to non-string "
2829-
"typed attribute '{}'!".format(name))
2859+
"typed attribute '{}'!".format(name))
28302860

28312861
if attr.isnullable and name not in nullmaps:
2832-
nullmaps[name] = ~np.ma.masked_invalid(attr_val).mask
2833-
attr_val = np.nan_to_num(attr_val)
2862+
try:
2863+
nullmaps[name] = ~np.ma.masked_invalid(attr_val).mask
2864+
except Exception as exc:
2865+
nullmaps[name] = np.array(
2866+
[int(v is not None) for v in attr_val], dtype=np.uint8)
2867+
2868+
if np.issubdtype(attr.dtype, np.string_):
2869+
attr_val = np.array(["" if v is None else v for v in attr_val])
2870+
else:
2871+
attr_val = np.nan_to_num(attr_val)
2872+
attr_val = np.array([0 if v is None else v for v in attr_val])
28342873
attr_val = np.ascontiguousarray(attr_val, dtype=attr.dtype)
2835-
2874+
28362875
except Exception as exc:
28372876
raise ValueError(f"NumPy array conversion check failed for attr '{name}'") from exc
28382877

tiledb/tests/test_libtiledb.py

Lines changed: 39 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -393,26 +393,56 @@ def test_array_write_nullable(self, sparse, pass_df):
393393

394394
uri = self.path("test_array_write_nullable")
395395
dom = tiledb.Domain(tiledb.Dim("d", domain=(1, 5), dtype="int64"))
396-
att = tiledb.Attr("a", dtype="int8", nullable=True)
397-
schema = tiledb.ArraySchema(domain=dom, attrs=[att], sparse=sparse)
396+
att1 = tiledb.Attr("a1", dtype="int8", nullable=True)
397+
att2 = tiledb.Attr("a2", dtype="str", nullable=True)
398+
schema = tiledb.ArraySchema(domain=dom, attrs=[att1, att2], sparse=sparse)
398399
tiledb.Array.create(uri, schema)
399400

400401
with tiledb.open(uri, "w") as A:
401402
dims = pa.array([1, 2, 3, 4, 5])
402-
data = pa.array([1.0, 2.0, None, 0, 1.0])
403+
data1 = pa.array([1.0, 2.0, None, 0, 1.0])
404+
data2 = pa.array(["a", "b", None, None, "c"])
403405
if pass_df:
404406
dims = dims.to_pandas()
405-
data = data.to_pandas()
407+
data1 = data1.to_pandas()
408+
data2 = data2.to_pandas()
406409

407410
if sparse:
408-
A[dims] = data
411+
A[dims] = {"a1": data1, "a2": data2}
409412
else:
410-
A[:] = data
413+
A[:] = {"a1": data1, "a2": data2}
411414

412415
with tiledb.open(uri, "r") as A:
413-
expected_validity = [False, False, True, False, False]
414-
assert_array_equal(A[:]["a"].mask, expected_validity)
415-
assert_array_equal(A.df[:]["a"].isna(), expected_validity)
416+
expected_validity1 = [False, False, True, False, False]
417+
assert_array_equal(A[:]["a1"].mask, expected_validity1)
418+
assert_array_equal(A.df[:]["a1"].isna(), expected_validity1)
419+
420+
expected_validity2 = [False, False, True, True, False]
421+
assert_array_equal(A[:]["a2"].mask, expected_validity2)
422+
assert_array_equal(A.df[:]["a2"].isna(), expected_validity2)
423+
424+
with tiledb.open(uri, "w") as A:
425+
dims = pa.array([1, 2, 3, 4, 5])
426+
data1 = pa.array([None, None, None, None, None])
427+
data2 = pa.array([None, None, None, None, None])
428+
if pass_df:
429+
dims = dims.to_pandas()
430+
data1 = data1.to_pandas()
431+
data2 = data2.to_pandas()
432+
433+
if sparse:
434+
A[dims] = {"a1": data1, "a2": data2}
435+
else:
436+
A[:] = {"a1": data1, "a2": data2}
437+
438+
with tiledb.open(uri, "r") as A:
439+
expected_validity1 = [True, True, True, True, True]
440+
assert_array_equal(A[:]["a1"].mask, expected_validity1)
441+
assert_array_equal(A.df[:]["a1"].isna(), expected_validity1)
442+
443+
expected_validity2 = [True, True, True, True, True]
444+
assert_array_equal(A[:]["a2"].mask, expected_validity2)
445+
assert_array_equal(A.df[:]["a2"].isna(), expected_validity2)
416446

417447

418448
class DenseArrayTest(DiskTestCase):

0 commit comments

Comments
 (0)