Skip to content

Commit f361d8f

Browse files
authored
Use Arrow Type Fixed-Width Binary ("w:") For Fixed-Width TILEDB_CHAR (#1286)
* Previously, exporting a fixed-width `TILEDB_CHAR` attribute to Arrow errored out with `ArrowInvalid: Expected 3 buffers for imported type large_binary, ArrowArray struct has 2` * Fixed-width `TILEDB_CHAR` was using the Arrow type large binary ("Z"), which requires 3 buffers, including an offsets buffer that is neither present nor necessary for fixed-width binary strings * This change does not handle fixed-width `TILEDB_UTF8` or `TILEDB_ASCII`
1 parent 522bb86 commit f361d8f

File tree

3 files changed

+34
-4
lines changed

3 files changed

+34
-4
lines changed

HISTORY.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
# In Progress
22

33
## Bug Fixes
4-
* Fix issue where querying an array with a Boolean type when `arrow=True`, but is unselected in `.query(attr=...)`, results in an error `pyarrow.lib.ArrowInvalid: Invalid column index to set field.` []()
4+
* Fix issue where querying an array with a Boolean type when `arrow=True`, but is unselected in `.query(attr=...)`, results in an error `pyarrow.lib.ArrowInvalid: Invalid column index to set field.` [#1291](https://github.com/TileDB-Inc/TileDB-Py/pull/1291)
5+
* Use Arrow type fixed-width binary ("w:") for non-variable TILEDB_CHAR [#1286](https://github.com/TileDB-Inc/TileDB-Py/pull/1286)
56

67
# TileDB-Py 0.17.1 Release Notes
78

tiledb/py_arrow_io_impl.h

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,9 @@ ArrowInfo tiledb_buffer_arrow_fmt(BufferInfo bufferinfo, bool use_list = true) {
183183
return ArrowInfo("U");
184184
}
185185
case TILEDB_CHAR:
186-
if (bufferinfo.offsets_elem_size == 4) {
186+
if (!bufferinfo.is_var) {
187+
return ArrowInfo("w:" + std::to_string(cell_val_num));
188+
} else if (bufferinfo.offsets_elem_size == 4) {
187189
return ArrowInfo("z");
188190
} else {
189191
return ArrowInfo("Z");
@@ -304,7 +306,10 @@ TypeInfo arrow_type_to_tiledb(ArrowSchema *arw_schema) {
304306
return {TILEDB_DATETIME_NS, 8, 1, large};
305307
else if (fmt == "z" || fmt == "Z")
306308
return {TILEDB_CHAR, 1, TILEDB_VAR_NUM, fmt == "Z"};
307-
else if (fmt == "u" || fmt == "U")
309+
else if (fmt.rfind("w:", 0) == 0) {
310+
uint32_t cell_val_num = atoi(fmt.substr(2).c_str());
311+
return {TILEDB_CHAR, 1, cell_val_num, fmt == "w"};
312+
} else if (fmt == "u" || fmt == "U")
308313
return {TILEDB_STRING_UTF8, 1, TILEDB_VAR_NUM, fmt == "U"};
309314
#if TILEDB_VERSION_MAJOR >= 2 && TILEDB_VERSION_MINOR >= 10
310315
else if (fmt == "b")
@@ -769,7 +774,14 @@ void ArrowExporter::export_(const std::string &name, ArrowArray *array,
769774
// adjust for arrow offset unless empty result
770775
elem_num = (bufferinfo.offsets_num == 0) ? 0 : bufferinfo.offsets_num - 1;
771776
} else {
772-
elem_num = bufferinfo.data_num;
777+
if (arrow_fmt.fmt_.rfind("w:", 0) == 0) {
778+
// for Arrow fixed-width binary (non-variable TILEDB_CHAR), we need to
779+
// take the size of the entire buffer and divide by the size of each
780+
// element
781+
elem_num = bufferinfo.data_num / bufferinfo.tdbtype.cell_val_num;
782+
} else {
783+
elem_num = bufferinfo.data_num;
784+
}
773785
}
774786

775787
auto cpp_arrow_array = new CPPArrowArray(elem_num, // elem_num

tiledb/tests/test_multi_index.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -888,6 +888,23 @@ def test_multi_index_timing(self):
888888
assert "py.getitem_time.pandas_index_update_time :" in internal_stats
889889
tiledb.stats_disable()
890890

891+
@pytest.mark.skipif(not has_pandas(), reason="pandas not installed")
892+
def test_fixed_width_char(self):
893+
uri = self.path("test_fixed_width_char")
894+
schema = tiledb.ArraySchema(
895+
domain=tiledb.Domain(tiledb.Dim(name="dim", domain=(0, 2), dtype=np.uint8)),
896+
attrs=[tiledb.Attr(dtype="|S3")],
897+
)
898+
tiledb.Array.create(uri, schema)
899+
900+
data = np.array(["cat", "dog", "hog"], dtype="|S3")
901+
902+
with tiledb.open(uri, mode="w") as A:
903+
A[:] = data
904+
905+
with tiledb.open(uri, mode="r") as A:
906+
assert all(A.query(use_arrow=True).df[:][""] == data)
907+
891908

892909
# parametrize dtype and sparse
893910
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)