From 94eeeff7cc19b051d590f138038de3c76a8704af Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Fri, 4 Apr 2025 08:12:45 -0400 Subject: [PATCH 01/52] tiledb_query_add_predicate --- examples/c_api/query_add_predicate.c | 502 ++++++++++++++++++ test/CMakeLists.txt | 1 + test/src/unit-query-add-predicate.cc | 442 +++++++++++++++ test/src/unit-sparse-global-order-reader.cc | 75 +-- test/support/CMakeLists.txt | 1 + test/support/src/array_templates.h | 115 ++++ test/support/src/query_helpers.cc | 102 ++++ test/support/src/query_helpers.h | 50 ++ test/support/src/vfs_helpers.h | 2 +- tiledb/oxidize/CMakeLists.txt | 10 +- tiledb/oxidize/Cargo.lock | 18 + tiledb/oxidize/Cargo.toml | 2 + tiledb/oxidize/arrow/src/lib.rs | 12 +- tiledb/oxidize/arrow/src/record_batch.rs | 107 +++- tiledb/oxidize/arrow/src/schema.rs | 87 ++- .../oxidize/cxx-interface/cc/array_schema.h | 4 + .../cxx-interface/src/sm/array_schema/mod.rs | 53 +- tiledb/oxidize/expr/Cargo.toml | 1 + tiledb/oxidize/expr/src/lib.rs | 14 +- tiledb/oxidize/expr/src/logical_expr.rs | 78 ++- tiledb/oxidize/expr/src/physical_expr.rs | 2 +- tiledb/oxidize/expr/src/query_condition.rs | 16 +- tiledb/oxidize/session/Cargo.toml | 19 + tiledb/oxidize/session/build.rs | 4 + tiledb/oxidize/session/src/lib.rs | 101 ++++ .../staticlibs/core-objects/Cargo.toml | 1 + .../staticlibs/core-objects/src/lib.rs | 1 + tiledb/sm/c_api/tiledb.cc | 25 + tiledb/sm/c_api/tiledb_experimental.h | 28 + tiledb/sm/cpp_api/query.h | 17 + tiledb/sm/cpp_api/query_experimental.h | 18 + tiledb/sm/query/query.cc | 90 +++- tiledb/sm/query/query.h | 19 + tiledb/sm/query/query_condition.cc | 64 ++- tiledb/sm/query/query_condition.h | 22 + .../sm/storage_manager/context_resources.cc | 10 +- tiledb/sm/storage_manager/context_resources.h | 18 + 37 files changed, 2005 insertions(+), 126 deletions(-) create mode 100644 examples/c_api/query_add_predicate.c create mode 100644 test/src/unit-query-add-predicate.cc create mode 100644 test/support/src/query_helpers.cc 
create mode 100644 test/support/src/query_helpers.h create mode 100644 tiledb/oxidize/session/Cargo.toml create mode 100644 tiledb/oxidize/session/build.rs create mode 100644 tiledb/oxidize/session/src/lib.rs diff --git a/examples/c_api/query_add_predicate.c b/examples/c_api/query_add_predicate.c new file mode 100644 index 00000000000..42a3ab1f87d --- /dev/null +++ b/examples/c_api/query_add_predicate.c @@ -0,0 +1,502 @@ +/** + * @file query_add_predicate.c + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2022 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This example demonstrates using the `tiledb_query_add_predicate` API + * to add one or more text predicates to a query. This API parses a SQL + * predicate and uses it to filter results inside of the storage engine + * before returning them to the user.
+ * + * The array used in this example is identical to that of the + * `query_condition_sparse` example. The first group of predicates which + * run are text equivalents of the predicates in that example, and produce + * the same results. + * + * This example also has additional queries which use predicates which + * combine dimensions and attributes. + */ + +#include +#include +#include +#include +#include +#include +#include + +// Name of array. +const char* array_name = "array_query_add_predicate"; + +#define TRY(ctx, action) \ + do { \ + const capi_return_t r = (action); \ + if (r != TILEDB_OK) { \ + return print_last_error((ctx), r); \ + } \ + } while (0) + +#define RETURN_IF_NOT_OK(r) \ + do { \ + const int32_t status = (r); \ + if (status != TILEDB_OK) { \ + return status; \ + } \ + } while (0) + +/** + * Enumeration variants + */ +static const char* const states[] = { + "alabama", + "alaska", + "arizona", + "arkansas", + "california", + "colorado", + "connecticut", + "etc"}; + +/** + * @brief Function to print the values of all the attributes for one + * index of this array. + * + * @param a Attribute a's value. + * @param b Attribute b's value. + * @param c Attribute c's value. + * @param d Attribute d's value. + */ +void print_elem( + int* a, char* b_start, int b_len, int32_t c, float d, uint8_t* e) { + char print_a[8], print_e[16]; + if (a == NULL) { + strcpy(&print_a[0], "null"); + } else { + sprintf(&print_a[0], "%d", *a); + } + if (e == NULL) { + strcpy(&print_e[0], "null"); + } else if (*e < sizeof(states) / sizeof(const char*)) { + strcpy(&print_e[0], states[*e]); + } else { + sprintf(&print_e[0], "(invalid key %hhu)", *e); + } + + printf("{%s, %.*s, %d, %.1f, %s}\n", print_a, b_len, b_start, c, d, print_e); +} + +/** + * Retrieve and print the last error. + * + * @param ctx The context object to get the error from. 
+ */ +int32_t print_last_error(tiledb_ctx_t* ctx, int32_t rc) { + tiledb_error_t* err = NULL; + tiledb_ctx_get_last_error(ctx, &err); + if (err == NULL) { + fprintf(stderr, "TileDB Error: Error code returned but no error found."); + return rc; + } + const char* msg = NULL; + tiledb_error_message(err, &msg); + if (msg == NULL) { + fprintf(stderr, "TileDB Error"); + } else { + fprintf(stderr, "%s\n", msg); + } + return rc; +} + +/** + * @brief Function to create the TileDB array used in this example. + * The array will be 1D with size 1 with dimension "index". + * The bounds on the index will be 0 through 9, inclusive. + * + * The array has two attributes. The two attributes are + * - "a" (type int) + * - "b" (type ASCII string) + * - "c" (type int32_t) + * - "d" (type float) + * + * @param ctx The context. + */ +int32_t create_array(tiledb_ctx_t* ctx) { + // Creating the dimension and the domain. + tiledb_dimension_t* dimension; + int dim_domain[] = {0, 9}; + int tile_extents[] = {1}; + TRY(ctx, + tiledb_dimension_alloc( + ctx, + "index", + TILEDB_INT32, + &dim_domain[0], + &tile_extents[0], + &dimension)); + + tiledb_domain_t* domain; + TRY(ctx, tiledb_domain_alloc(ctx, &domain)); + TRY(ctx, tiledb_domain_add_dimension(ctx, domain, dimension)); + + // The array will be sparse. 
+ tiledb_array_schema_t* schema; + TRY(ctx, tiledb_array_schema_alloc(ctx, TILEDB_SPARSE, &schema)); + TRY(ctx, tiledb_array_schema_set_domain(ctx, schema, domain)); + TRY(ctx, tiledb_array_schema_set_cell_order(ctx, schema, TILEDB_ROW_MAJOR)); + + // Create enumeration + size_t states_size = 0; + for (uint64_t i = 0; i < sizeof(states) / sizeof(const char*); i++) { + states_size += strlen(states[i]); + } + const uint64_t states_offsets_size = + (sizeof(states) / sizeof(const char*)) * sizeof(uint64_t); + + char* states_values = (char*)(malloc(states_size)); + uint64_t* states_offsets = (uint64_t*)(malloc(states_offsets_size)); + + states_size = 0; + for (uint64_t i = 0; i < sizeof(states) / sizeof(const char*); i++) { + const uint64_t slen = strlen(states[i]); + memcpy(&states_values[states_size], &states[i][0], slen); + states_offsets[i] = states_size; + states_size += slen; + } + tiledb_enumeration_t* enumeration_states; + TRY(ctx, + tiledb_enumeration_alloc( + ctx, + "us_states", + TILEDB_STRING_ASCII, + UINT32_MAX, + false, + states_values, + states_size, + states_offsets, + states_offsets_size, + &enumeration_states)); + free(states_offsets); + free(states_values); + + TRY(ctx, + tiledb_array_schema_add_enumeration(ctx, schema, enumeration_states)); + + // Adding the attributes of the array to the array schema. 
+ tiledb_attribute_t* a; + TRY(ctx, tiledb_attribute_alloc(ctx, "a", TILEDB_INT32, &a)); + TRY(ctx, tiledb_attribute_set_nullable(ctx, a, true)); + + tiledb_attribute_t* b; + TRY(ctx, tiledb_attribute_alloc(ctx, "b", TILEDB_STRING_ASCII, &b)); + TRY(ctx, tiledb_attribute_set_cell_val_num(ctx, b, TILEDB_VAR_NUM)); + + tiledb_attribute_t* c; + TRY(ctx, tiledb_attribute_alloc(ctx, "c", TILEDB_INT32, &c)); + + tiledb_attribute_t* d; + TRY(ctx, tiledb_attribute_alloc(ctx, "d", TILEDB_FLOAT32, &d)); + + tiledb_attribute_t* e; + TRY(ctx, tiledb_attribute_alloc(ctx, "e", TILEDB_UINT8, &e)); + TRY(ctx, tiledb_attribute_set_nullable(ctx, e, true)); + TRY(ctx, tiledb_attribute_set_enumeration_name(ctx, e, "us_states")); + + TRY(ctx, tiledb_array_schema_add_attribute(ctx, schema, a)); + TRY(ctx, tiledb_array_schema_add_attribute(ctx, schema, b)); + TRY(ctx, tiledb_array_schema_add_attribute(ctx, schema, c)); + TRY(ctx, tiledb_array_schema_add_attribute(ctx, schema, d)); + TRY(ctx, tiledb_array_schema_add_attribute(ctx, schema, e)); + + // Create the (empty) array. + TRY(ctx, tiledb_array_create(ctx, array_name, schema)); + + // Cleanup. + tiledb_attribute_free(&e); + tiledb_attribute_free(&d); + tiledb_attribute_free(&c); + tiledb_attribute_free(&b); + tiledb_attribute_free(&a); + tiledb_array_schema_free(&schema); + tiledb_domain_free(&domain); + tiledb_dimension_free(&dimension); + + return TILEDB_OK; +} + +/** + * @brief Execute a write on array query_condition_sparse array + * which then stores the following data in the array. The table + * is organized by dimension/attribute. 
+ * + * index | a | b | c | d | e + * ------+------+-------+---+-----+------------ + * 0 | null | alice | 0 | 4.1 | arizona + * 1 | 2 | bob | 0 | 3.4 | etc + * 2 | null | craig | 0 | 5.6 | connecticut + * 3 | 4 | dave | 0 | 3.7 | colorado + * 4 | null | erin | 0 | 2.3 | null + * 5 | 6 | frank | 0 | 1.7 | arkansas + * 6 | null | grace | 1 | 3.8 | etc + * 7 | 8 | heidi | 2 | 4.9 | etc + * 8 | null | ivan | 3 | 3.2 | colorado + * 9 | 10 | judy | 4 | 3.1 | california + * + * @param ctx The context. + */ +int32_t write_array(tiledb_ctx_t* ctx) { + // Create data buffers that store the values to be written in. + int dim_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + uint64_t dim_size = sizeof(dim_data); + int32_t a_data[] = {0, 2, 0, 4, 0, 6, 0, 8, 0, 10}; + uint64_t a_size = sizeof(a_data); + uint8_t a_data_validity[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; + uint64_t a_validity_size = sizeof(a_data_validity); + char* b_data = "alicebobcraigdaveerinfrankgraceheidiivanjudy"; + uint64_t b_size = strlen(b_data); + uint64_t b_data_offsets[] = {0, 5, 8, 13, 17, 21, 26, 31, 36, 40}; + uint64_t b_offsets_size = sizeof(b_data_offsets); + int32_t c_data[] = {0, 0, 0, 0, 0, 0, 1, 2, 3, 4}; + uint64_t c_size = sizeof(c_data); + float d_data[] = {4.1, 3.4, 5.6, 3.7, 2.3, 1.7, 3.8, 4.9, 3.2, 3.1}; + uint64_t d_size = sizeof(d_data); + uint8_t e_data[] = {2, 7, 5, 6, 100, 3, 7, 7, 5, 4}; + uint64_t e_size = sizeof(e_data); + uint8_t e_validity[] = {1, 1, 1, 1, 0, 1, 1, 1, 1, 1}; + uint64_t e_validity_size = sizeof(e_validity); + + tiledb_array_t* array_w; + TRY(ctx, tiledb_array_alloc(ctx, array_name, &array_w)); + TRY(ctx, tiledb_array_open(ctx, array_w, TILEDB_WRITE)); + + // Execute the write query. 
+ tiledb_query_t* query_w; + TRY(ctx, tiledb_query_alloc(ctx, array_w, TILEDB_WRITE, &query_w)); + TRY(ctx, tiledb_query_set_layout(ctx, query_w, TILEDB_UNORDERED)); + TRY(ctx, + tiledb_query_set_data_buffer(ctx, query_w, "index", dim_data, &dim_size)); + TRY(ctx, tiledb_query_set_data_buffer(ctx, query_w, "a", a_data, &a_size)); + TRY(ctx, + tiledb_query_set_validity_buffer( + ctx, query_w, "a", a_data_validity, &a_validity_size)); + TRY(ctx, tiledb_query_set_data_buffer(ctx, query_w, "b", b_data, &b_size)); + TRY(ctx, + tiledb_query_set_offsets_buffer( + ctx, query_w, "b", b_data_offsets, &b_offsets_size)); + TRY(ctx, tiledb_query_set_data_buffer(ctx, query_w, "c", c_data, &c_size)); + TRY(ctx, tiledb_query_set_data_buffer(ctx, query_w, "d", d_data, &d_size)); + TRY(ctx, tiledb_query_set_data_buffer(ctx, query_w, "e", e_data, &e_size)); + TRY(ctx, + tiledb_query_set_validity_buffer( + ctx, query_w, "e", e_validity, &e_validity_size)); + TRY(ctx, tiledb_query_submit(ctx, query_w)); + TRY(ctx, tiledb_query_finalize(ctx, query_w)); + TRY(ctx, tiledb_array_close(ctx, array_w)); + + tiledb_query_free(&query_w); + tiledb_array_free(&array_w); + + return TILEDB_OK; +} + +/** + * @brief Executes the read query for the array created in write_array. + * + * @param ctx The context. + * @param qc The query condition to execute the query with. + */ +int32_t read_array_with_predicates(tiledb_ctx_t* ctx, int num_predicates, ...) { + // Create data buffers to read the values into. + int a_data[10]; + uint64_t a_size = sizeof(a_data); + uint8_t a_data_validity[10]; + uint64_t a_validity_size = sizeof(a_data_validity); + + // We initialize the string b_data to contain 45 characters because + // that is the combined size of all strings in attribute b. 
+ char b_data[256]; + memset(b_data, 0, 256); + uint64_t b_size = sizeof(b_data); + uint64_t b_data_offsets[10]; + uint64_t b_offsets_size = sizeof(b_data_offsets); + + int32_t c_data[10]; + uint64_t c_size = sizeof(c_data); + float d_data[10]; + uint64_t d_size = sizeof(d_data); + + uint8_t e_data[10]; + uint64_t e_size = sizeof(e_data); + uint8_t e_validity[10]; + uint64_t e_validity_size = sizeof(e_validity); + + tiledb_array_t* array; + TRY(ctx, tiledb_array_alloc(ctx, array_name, &array)); + TRY(ctx, tiledb_array_open(ctx, array, TILEDB_READ)); + + // Execute the read query. + tiledb_query_t* query; + TRY(ctx, tiledb_query_alloc(ctx, array, TILEDB_READ, &query)); + TRY(ctx, tiledb_query_set_layout(ctx, query, TILEDB_GLOBAL_ORDER)); + TRY(ctx, tiledb_query_set_data_buffer(ctx, query, "a", a_data, &a_size)); + TRY(ctx, + tiledb_query_set_validity_buffer( + ctx, query, "a", a_data_validity, &a_validity_size)); + TRY(ctx, tiledb_query_set_data_buffer(ctx, query, "b", b_data, &b_size)); + TRY(ctx, + tiledb_query_set_offsets_buffer( + ctx, query, "b", b_data_offsets, &b_offsets_size)); + TRY(ctx, tiledb_query_set_data_buffer(ctx, query, "c", c_data, &c_size)); + TRY(ctx, tiledb_query_set_data_buffer(ctx, query, "d", d_data, &d_size)); + TRY(ctx, tiledb_query_set_data_buffer(ctx, query, "e", e_data, &e_size)); + TRY(ctx, + tiledb_query_set_validity_buffer( + ctx, query, "e", e_validity, &e_validity_size)); + + va_list predicates; + va_start(predicates, num_predicates); + for (int i = 0; i < num_predicates; i++) { + const char* predicate = va_arg(predicates, const char*); + TRY(ctx, tiledb_query_add_predicate(ctx, query, predicate)); + } + va_end(predicates); + + TRY(ctx, tiledb_query_submit(ctx, query)); + + // Collect the results of the read query. The number of elements + // the filtered array contains is calculated by determining the + // number of valid elements in c_data, since the array is + // sparse. 
The length of the filtered substring of all the + // data is in b_data, and all the offsets for filtered + // individual elements are in b_data_offsets. + + // Here we print all the elements that are returned by the query. + uint64_t result_num = c_size / sizeof(int); + for (uint64_t i = 0; i < result_num; ++i) { + uint64_t element_start = b_data_offsets[i]; + uint64_t element_length = (i == result_num - 1) ? + (b_size / sizeof(char)) - element_start : + b_data_offsets[i + 1] - element_start; + print_elem( + a_data_validity[i] ? &a_data[i] : NULL, + b_data + element_start, + element_length, + c_data[i], + d_data[i], + e_validity[i] ? &e_data[i] : NULL); + } + + TRY(ctx, tiledb_query_finalize(ctx, query)); + TRY(ctx, tiledb_array_close(ctx, array)); + + tiledb_query_free(&query); + tiledb_array_free(&array); + + return TILEDB_OK; +} + +int32_t read_array_with_predicate(tiledb_ctx_t* ctx, const char* predicate) { + return read_array_with_predicates(ctx, 1, predicate); +} + +int main() { + // Create the context. + tiledb_ctx_t* ctx; + tiledb_ctx_alloc(NULL, &ctx); + + tiledb_vfs_t* vfs; + tiledb_vfs_alloc(ctx, NULL, &vfs); + + int32_t is_dir = 0; + tiledb_vfs_is_dir(ctx, vfs, array_name, &is_dir); + if (!is_dir) { + // Create and write data to the array. + RETURN_IF_NOT_OK(create_array(ctx)); + RETURN_IF_NOT_OK(write_array(ctx)); + } + + // EXAMPLES FROM query_condition_sparse.c EXAMPLE + + // Printing the entire array. + printf("WHERE TRUE\n"); + RETURN_IF_NOT_OK(read_array_with_predicates(ctx, 0)); + printf("\n"); + + // Execute a read query with query condition `a = null`. + printf("WHERE a IS NULL\n"); + RETURN_IF_NOT_OK(read_array_with_predicate(ctx, "a IS NULL")); + printf("\n"); + + // Execute a read query with query condition `b < "eve"`. + printf("SELECT * WHERE b < 'eve'\n"); + RETURN_IF_NOT_OK(read_array_with_predicate(ctx, "b < 'eve'")); + printf("\n"); + + // Execute a read query with query condition `c >= 1`. 
+ printf("SELECT * WHERE c >= 1\n"); + RETURN_IF_NOT_OK(read_array_with_predicate(ctx, "c >= 1")); + printf("\n"); + + // Execute a read query with query condition `3.0f <= d AND d <= 4.0f`. + printf("WHERE d BETWEEN 3.0 AND 4.0\n"); + RETURN_IF_NOT_OK(read_array_with_predicate(ctx, "d BETWEEN 3.0 AND 4.0")); + printf("\n"); + + // Execute a read query with query condition `3.0f <= d AND d <= 4.0f AND a != + // null AND b < \"eve\"`. + printf("WHERE (d BETWEEN 3.0 AND 4.0) AND a IS NOT NULL AND b < 'eve'\n"); + RETURN_IF_NOT_OK(read_array_with_predicates( + ctx, 3, "d BETWEEN 3.0 AND 4.0", "a IS NOT NULL", "b < 'eve'")); + printf("\n"); + + // BEGIN EXAMPLES WITH ENUMERATIONS + printf("WHERE e = 'california'\n"); + { + // error is expected since the enumeration is not loaded + const int32_t ret = read_array_with_predicate(ctx, "e = 'california'"); + if (ret != TILEDB_ERR) { + return TILEDB_ERR; + } + } + printf("\n"); + + // BEGIN EXAMPLES WITH NO EQUIVALENT + + // query condition does not have functions, here we use coalesce + printf("WHERE coalesce(a, 2) + c < index\n"); + RETURN_IF_NOT_OK( + read_array_with_predicate(ctx, "coalesce(a, 2) + c < index")); + printf("\n"); + + // FIXME: this is query-condition-able, use arithmetic + printf("WHERE a > 6 OR a IS NULL\n"); + RETURN_IF_NOT_OK(read_array_with_predicate(ctx, "a > 6 OR a IS NULL")); + printf("\n"); + + tiledb_ctx_free(&ctx); + + return 0; +} diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 92595cdf557..8aec5f5b8b0 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -114,6 +114,7 @@ set(TILEDB_UNIT_TEST_SOURCES src/unit-ordered-dim-label-reader.cc src/unit-tile-metadata.cc src/unit-tile-metadata-generator.cc + src/unit-query-add-predicate.cc src/unit-query-plan.cc src/unit-ReadCellSlabIter.cc src/unit-Reader.cc diff --git a/test/src/unit-query-add-predicate.cc b/test/src/unit-query-add-predicate.cc new file mode 100644 index 00000000000..bd9fc2993a5 --- /dev/null +++ 
b/test/src/unit-query-add-predicate.cc @@ -0,0 +1,442 @@ +/** + * @file unit-query-add-predicate.cc + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2025 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * Tests for the C API tiledb_query_add_predicate.
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "test/support/src/array_templates.h" +#include "test/support/src/error_helpers.h" +#include "test/support/src/helpers.h" +#include "test/support/src/vfs_helpers.h" +#include "tiledb/api/c_api/array/array_api_internal.h" +#include "tiledb/sm/cpp_api/tiledb" +#include "tiledb/sm/cpp_api/tiledb_experimental" + +using namespace tiledb; +using namespace tiledb::test; + +// no rapidcheck +using Asserter = AsserterCatch; + +// query result type for the array schema used in these tests +using Cells = templates::Fragment2D< + uint64_t, + uint64_t, + std::optional, + std::vector, + std::optional>; + +struct QueryAddPredicateFx { + VFSTestSetup vfs_test_setup_; + + Context context() const { + return vfs_test_setup_.ctx(); + } + + /** + * Creates and writes a two-dimension array with attributes: + * - 'a INT32' + * - 'v VARCHAR NOT NULL' + * - 'e UINT8:VARCHAR' + */ + void create_array(const std::string& path, tiledb_array_type_t atype); + + /** + * Writes cells to saturate the ranges [[1, 4], [1, 4]] for an array + * of the schema given above + */ + void write_array(const std::string& path, tiledb_array_type_t atype); + + Cells query_array( + const std::string& path, + tiledb_layout_t layout, + std::vector predicates); + + Cells query_array( + const std::string& path, tiledb_layout_t layout, const char* predicate) { + return query_array(path, layout, std::vector{predicate}); + } + + static const Cells INPUT; +}; + +const Cells QueryAddPredicateFx::INPUT = Cells{ + .d1_ = templates::query_buffers( + {1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4}), + .d2_ = templates::query_buffers( + {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4}), + .atts_ = std::make_tuple( + templates::query_buffers>( + std::vector>{ + 15, + std::nullopt, + std::nullopt, + 12, + std::nullopt, + 10, + 9, + std::nullopt, + 7, + 6, + 5, + 4, + std::nullopt, + 2, + 1, + 0}), + 
templates::query_buffers(std::vector{ + "one", + "two", + "three", + "four", + "five", + "six", + "seven", + "eight", + "nine", + "ten", + "eleven", + "twelve", + "thirteen", + "fourteen", + "fifteen", + "sixteen"}), + templates::query_buffers>( + std::vector>{ + 4, + 4, + 7, + std::nullopt, + 7, + 7, + std::nullopt, + 0, + 1, + std::nullopt, + 3, + 4, + std::nullopt, + 6, + 7, + std::nullopt}))}; + +void QueryAddPredicateFx::create_array( + const std::string& path, tiledb_array_type_t atype) { + auto ctx = context(); + + Domain domain(ctx); + domain.add_dimension(Dimension::create(ctx, "row", {{1, 4}}, 4)); + domain.add_dimension(Dimension::create(ctx, "col", {{1, 4}}, 4)); + + ArraySchema schema(ctx, atype); + schema.set_tile_order(TILEDB_ROW_MAJOR); + schema.set_cell_order(TILEDB_ROW_MAJOR); + schema.set_domain(domain); + + schema.add_attribute(Attribute::create(ctx, "a").set_nullable(true)); + schema.add_attribute(Attribute::create(ctx, "v")); + + // enumerated attribute + std::vector us_states = { + "alabama", + "alaska", + "arizona", + "arkansas", + "california", + "colorado", + "connecticut", + "etc"}; + ArraySchemaExperimental::add_enumeration( + ctx, + schema, + Enumeration::create(ctx, std::string("us_states"), us_states)); + { + auto e = Attribute::create(ctx, "e").set_nullable(true); + AttributeExperimental::set_enumeration_name(ctx, e, "us_states"); + schema.add_attribute(e); + } + + Array::create(path, schema); +} + +void QueryAddPredicateFx::write_array( + const std::string& path, tiledb_array_type_t atype) { + auto ctx = context(); + Array array(ctx, path, TILEDB_WRITE); + Query query(ctx, array); + + if (atype == TILEDB_DENSE) { + Subarray s(ctx, array); + s.add_range(0, 1, 4); + s.add_range(1, 1, 4); + query.set_layout(TILEDB_ROW_MAJOR).set_subarray(s); + + templates::Fragment< + std::optional, + std::vector, + std::optional> + cells = {.atts_ = INPUT.atts_}; + + auto field_sizes = templates::query::make_field_sizes(cells); + 
templates::query::set_fields( + ctx.ptr().get(), + query.ptr().get(), + field_sizes, + cells, + array.ptr().get()->array_schema_latest()); + + query.submit(); + } else { + auto field_sizes = + templates::query::make_field_sizes(const_cast(INPUT)); + templates::query::set_fields( + ctx.ptr().get(), + query.ptr().get(), + field_sizes, + const_cast(INPUT), + array.ptr().get()->array_schema_latest()); + query.submit(); + } +} + +Cells QueryAddPredicateFx::query_array( + const std::string& path, + tiledb_layout_t layout, + std::vector predicates) { + auto ctx = context(); + + Array array(ctx, path, TILEDB_READ); + Query query(ctx, array); + + query.set_layout(layout); + + Cells out; + out.resize(32); + + auto field_sizes = + templates::query::make_field_sizes(out, out.size()); + + templates::query::set_fields( + ctx.ptr().get(), + query.ptr().get(), + field_sizes, + out, + array.ptr().get()->array_schema_latest()); + + for (const char* pred : predicates) { + QueryExperimental::add_predicate(ctx, query, pred); + } + + if (array.schema().array_type() == TILEDB_DENSE) { + Subarray s(ctx, array); + s.add_range(0, 1, 4); + s.add_range(1, 1, 4); + query.set_subarray(s); + } + + const auto st = query.submit(); + REQUIRE(st == Query::Status::COMPLETE); + + templates::query::resize_fields(out, field_sizes); + + return out; +} + +TEST_CASE_METHOD( + QueryAddPredicateFx, + "C API: Test query add predicate errors", + "[capi][query][add_predicate]") { + const std::string array_name = + vfs_test_setup_.array_uri("test_qeury_add_predicate_errors"); + + create_array(array_name, TILEDB_SPARSE); + write_array(array_name, TILEDB_SPARSE); + + auto ctx = context(); + + SECTION("Non-read query errors") { + Array array(ctx, array_name, TILEDB_WRITE); + Query query(ctx, array); + + REQUIRE_THROWS_WITH( + QueryExperimental::add_predicate(ctx, query, "row BETWEEN 4 AND 7"), + Catch::Matchers::ContainsSubstring( + "Cannot add query predicate; Operation only applicable to read " + "queries")); + } 
+ + SECTION("Read query errors") { + Array array(ctx, array_name, TILEDB_READ); + Query query(ctx, array); + + SECTION("Null") { + REQUIRE_THROWS_WITH( + QueryExperimental::add_predicate(ctx, query, nullptr), + Catch::Matchers::ContainsSubstring( + "Argument \"predicate\" may not be NULL")); + } + + SECTION("Syntax error") { + // FIXME: this smells like a bug in datafusion. + // If you dbg! the returned expr it prints `Expr::Column(Column { name: + // "row" })` + REQUIRE_THROWS_WITH( + QueryExperimental::add_predicate(ctx, query, "row col"), + Catch::Matchers::ContainsSubstring( + "Error: Expression does not return a boolean value")); + } + + SECTION("Non-expression") { + REQUIRE_THROWS_WITH( + QueryExperimental::add_predicate( + ctx, query, "CREATE TABLE foo (id INT)"), + Catch::Matchers::ContainsSubstring( + "Error adding predicate: Parse error: SQL error: " + "ParserError(\"Unsupported command in expression\")")); + } + + SECTION("Not a predicate") { + REQUIRE_THROWS_WITH( + QueryExperimental::add_predicate(ctx, query, "row"), + Catch::Matchers::ContainsSubstring( + "Expression does not return a boolean value")); + } + + SECTION("Schema error") { + REQUIRE_THROWS_WITH( + QueryExperimental::add_predicate(ctx, query, "depth = 3"), + Catch::Matchers::ContainsSubstring( + "Error adding predicate: Parse error: Schema error: No field " + "named depth. 
Valid fields are row, col, a, v, e.")); + } + + SECTION("Type coercion failure") { + // FIXME: from the tables CLI this gives a very different error which is + // more user-friendly, there must be some optimization pass which we are + // not doing + const std::string dferror = + "Error adding predicate: Type coercion error: Internal error: Expect " + "TypeSignatureClass::Native(LogicalType(Native(String), String)) but " + "received NativeType::UInt64, DataType: UInt64.\nThis was likely " + "caused by a bug in DataFusion's code and we would welcome that you " + "file an bug report in our issue tracker"; + REQUIRE_THROWS_WITH( + QueryExperimental::add_predicate(ctx, query, "starts_with(row, '1')"), + Catch::Matchers::ContainsSubstring(dferror)); + } + + SECTION("Aggregate") { + REQUIRE_THROWS_WITH( + QueryExperimental::add_predicate(ctx, query, "sum(row) >= 10"), + Catch::Matchers::ContainsSubstring( + "Aggregate functions in predicate is not supported")); + } + } +} + +TEST_CASE_METHOD( + QueryAddPredicateFx, + "C API: Test query add predicate dense", + "[capi][query][add_predicate]") { + const std::string array_name = + vfs_test_setup_.array_uri("test_qeury_add_predicate_dense"); + + create_array(array_name, TILEDB_DENSE); + write_array(array_name, TILEDB_DENSE); + + // FIXME: error messages + REQUIRE_THROWS(query_array(array_name, TILEDB_UNORDERED, "row >= 3")); + REQUIRE_THROWS(query_array(array_name, TILEDB_ROW_MAJOR, "row >= 3")); + REQUIRE_THROWS(query_array(array_name, TILEDB_COL_MAJOR, "row >= 3")); + REQUIRE_THROWS(query_array(array_name, TILEDB_GLOBAL_ORDER, "row >= 3")); + REQUIRE_THROWS(query_array(array_name, TILEDB_HILBERT, "row >= 3")); +} + +TEST_CASE_METHOD( + QueryAddPredicateFx, + "C API: Test query add predicate legacy", + "[capi][query][add_predicate]") { + const std::string array_name = + vfs_test_setup_.array_uri("test_qeury_add_predicate_legacy"); + // TODO +} + +TEST_CASE_METHOD( + QueryAddPredicateFx, + "C API: Test query add predicate 
sparse unsupported query order", + "[capi][query][add_predicate]") { + const std::string array_name = + vfs_test_setup_.array_uri("test_qeury_add_predicate_sparse_unsupported"); + + create_array(array_name, TILEDB_SPARSE); + write_array(array_name, TILEDB_SPARSE); + // TODO +} + +TEST_CASE_METHOD( + QueryAddPredicateFx, + "C API: Test query add predicate sparse global order", + "[capi][query][add_predicate]") { + const std::string array_name = + vfs_test_setup_.array_uri("test_qeury_add_predicate_sparse_global_order"); + + create_array(array_name, TILEDB_SPARSE); + write_array(array_name, TILEDB_SPARSE); + + SECTION("WHERE TRUE") { + const auto result = query_array(array_name, TILEDB_GLOBAL_ORDER, "TRUE"); + CHECK(result == INPUT); + } + + SECTION("WHERE a IS NULL") { + // TODO + } + + SECTION("WHERE b < 'fourteen'") { + // TODO + } + + SECTION("WHERE row + col <= 4") { + // TODO + } + + SECTION("WHERE coalesce(a, row) > a") { + // TODO + } +} diff --git a/test/src/unit-sparse-global-order-reader.cc b/test/src/unit-sparse-global-order-reader.cc index 0046caae331..b6c2a2262ac 100644 --- a/test/src/unit-sparse-global-order-reader.cc +++ b/test/src/unit-sparse-global-order-reader.cc @@ -36,6 +36,7 @@ #include "test/support/src/array_templates.h" #include "test/support/src/error_helpers.h" #include "test/support/src/helpers.h" +#include "test/support/src/query_helpers.h" #include "test/support/src/vfs_helpers.h" #include "tiledb/api/c_api/array/array_api_internal.h" #include "tiledb/sm/c_api/tiledb.h" @@ -147,6 +148,7 @@ struct FxRun1D { // for evaluating std::optional> condition; + bool condition_use_datafusion = false; DefaultArray1DConfig array; SparseGlobalOrderReaderMemoryBudget memory; @@ -258,6 +260,7 @@ struct FxRun2D { std::optional>>> subarray; std::optional> condition; + bool condition_use_datafusion; size_t num_user_cells; @@ -271,7 +274,8 @@ struct FxRun2D { SparseGlobalOrderReaderMemoryBudget memory; FxRun2D() - : capacity(64) + : 
condition_use_datafusion(false) + , capacity(64) , allow_dups(true) , tile_order_(TILEDB_ROW_MAJOR) , cell_order_(TILEDB_ROW_MAJOR) { @@ -509,7 +513,8 @@ struct CSparseGlobalOrderFx { template DeleteArrayGuard run_create(Instance& instance); template - void run_execute(Instance& instance); + std::optional run_execute( + Instance& instance); /** * Runs an input against a fresh array. @@ -517,7 +522,7 @@ struct CSparseGlobalOrderFx { * and checks that what we read out matches what we put in. */ template - void run(Instance& instance); + std::optional run(Instance& instance); template std::optional error_if_any(CAPIReturn apirc) const; @@ -3401,11 +3406,12 @@ void CSparseGlobalOrderFx::create_array(const Instance& instance) { * expected result order computed from the input data. */ template -void CSparseGlobalOrderFx::run(Instance& instance) { +std::optional CSparseGlobalOrderFx::run( + Instance& instance) { reset_config(); auto tmparray = run_create(instance); - run_execute(instance); + return run_execute(instance); } template @@ -3432,10 +3438,11 @@ DeleteArrayGuard CSparseGlobalOrderFx::run_create(Instance& instance) { } template -void CSparseGlobalOrderFx::run_execute(Instance& instance) { +std::optional +CSparseGlobalOrderFx::run_execute(Instance& instance) { ASSERTER(instance.num_user_cells > 0); - std::decay_t expect; + typename Instance::FragmentType expect; // for de-duplicating, track the fragment that each coordinate came from // we will use this to select the coordinate from the most recent fragment @@ -3453,12 +3460,6 @@ void CSparseGlobalOrderFx::run_execute(Instance& instance) { expect_fragment.insert(expect_fragment.end(), fragment.size(), f); } else { std::vector accept; - std::optional< - templates::QueryConditionEvalSchema> - eval; - if (instance.condition.has_value()) { - eval.emplace(); - } for (uint64_t i = 0; i < fragment.size(); i++) { if (!instance.pass_subarray(fragment, i)) { continue; @@ -3560,10 +3561,17 @@ void 
CSparseGlobalOrderFx::run_execute(Instance& instance) { } if (instance.condition.has_value()) { - tiledb::sm::QueryCondition qc(instance.condition->get()->clone()); - const auto rc = - query->query_->set_condition(qc); // SAFETY: this performs a deep copy - ASSERTER(rc.to_string() == "Ok"); + if (instance.condition_use_datafusion) { + const std::string sql = tiledb::test::to_sql( + *instance.condition.value().get(), + array->array()->array_schema_latest()); + TRY(context(), tiledb_query_add_predicate(context(), query, sql.c_str())); + } else { + tiledb::sm::QueryCondition qc(instance.condition->get()->clone()); + const auto rc = query->query_->set_condition( + qc); // SAFETY: this performs a deep copy + ASSERTER(rc.to_string() == "Ok"); + } } // Prepare output buffer @@ -3613,7 +3621,7 @@ void CSparseGlobalOrderFx::run_execute(Instance& instance) { } } tiledb_query_free(&query); - return; + return std::nullopt; } if (err->find("Cannot set array memory budget") != std::string::npos) { if (!vfs_test_setup_.is_rest()) { @@ -3626,7 +3634,7 @@ void CSparseGlobalOrderFx::run_execute(Instance& instance) { ASSERTER(array_usage > array_budget); } tiledb_query_free(&query); - return; + return std::nullopt; } if constexpr (std::is_same_v) { if (err->find( @@ -3636,13 +3644,13 @@ void CSparseGlobalOrderFx::run_execute(Instance& instance) { // we can probably make some assertions about what this should have // looked like but for now we'll let it go tiledb_query_free(&query); - return; + return std::nullopt; } if (err->find("Cannot load tile offsets") != std::string::npos) { // not enough memory budget for tile offsets, don't bother asserting // about it (for now?) tiledb_query_free(&query); - return; + return std::nullopt; } } } @@ -3706,15 +3714,7 @@ void CSparseGlobalOrderFx::run_execute(Instance& instance) { // Clean up. tiledb_query_free(&query); - std::apply( - [outcursor](auto&... outfield) { - std::apply( - [&](const auto&... 
field_cursor) { - (outfield.finish_multipart_read(field_cursor), ...); - }, - outcursor); - }, - std::tuple_cat(outdims, outatts)); + templates::query::resize_fields(out, outcursor); ASSERTER(expect.dimensions() == outdims); @@ -3788,6 +3788,8 @@ void CSparseGlobalOrderFx::run_execute(Instance& instance) { ASSERTER(can_complete.has_value()); } } + + return expect; } // rapidcheck generators and Arbitrary specializations @@ -3869,20 +3871,22 @@ struct Arbitrary> { auto num_user_cells = gen::inRange(1, 8 * 1024 * 1024); return gen::apply( - [](auto fragments, int num_user_cells) { + [](auto fragments, int num_user_cells, bool condition_use_datafusion) { FxRun1D instance; instance.array.allow_dups_ = std::get<0>(fragments); instance.array.dimension_ = std::get<1>(fragments); instance.subarray = std::get<2>(fragments); instance.fragments = std::move(std::get<3>(fragments).first); instance.condition = std::move(std::get<3>(fragments).second); + instance.condition_use_datafusion = condition_use_datafusion; instance.num_user_cells = num_user_cells; return instance; }, fragments, - num_user_cells); + num_user_cells, + gen::arbitrary()); } }; @@ -3978,7 +3982,8 @@ struct Arbitrary { [](auto fragments, int num_user_cells, tiledb_layout_t tile_order, - tiledb_layout_t cell_order) { + tiledb_layout_t cell_order, + bool condition_use_datafusion) { FxRun2D instance; instance.allow_dups = std::get<0>(fragments); instance.d1 = std::get<1>(fragments); @@ -3986,6 +3991,7 @@ struct Arbitrary { instance.subarray = std::get<3>(fragments); instance.fragments = std::move(std::get<4>(fragments).first); instance.condition = std::move(std::get<4>(fragments).second); + instance.condition_use_datafusion = condition_use_datafusion; // TODO: capacity instance.num_user_cells = num_user_cells; @@ -3997,7 +4003,8 @@ struct Arbitrary { fragments, num_user_cells, tile_order, - cell_order); + cell_order, + gen::arbitrary()); } }; diff --git a/test/support/CMakeLists.txt b/test/support/CMakeLists.txt 
index 6eb891a6dba..e834e710957 100644 --- a/test/support/CMakeLists.txt +++ b/test/support/CMakeLists.txt @@ -46,6 +46,7 @@ set(TILEDB_TEST_SUPPORT_SOURCES src/helpers-dimension.h src/mem_helpers.h src/mem_helpers.cc + src/query_helpers.cc src/serialization_wrappers.cc src/stats.cc src/temporary_local_directory.cc diff --git a/test/support/src/array_templates.h b/test/support/src/array_templates.h index 7858077b2b7..f1b085e44b6 100644 --- a/test/support/src/array_templates.h +++ b/test/support/src/array_templates.h @@ -36,6 +36,7 @@ #include "tiledb.h" #include "tiledb/common/unreachable.h" +#include "tiledb/sm/array_schema/array_schema.h" #include "tiledb/sm/query/ast/query_ast.h" #include "tiledb/type/datatype_traits.h" #include "tiledb/type/range/range.h" @@ -578,6 +579,18 @@ struct query_buffers> { query_buffers() { } + query_buffers(std::vector> cells) { + for (const auto& cell : cells) { + if (cell.has_value()) { + values_.push_back(cell.value()); + validity_.push_back(1); + } else { + values_.push_back(T()); + validity_.push_back(0); + } + } + } + query_buffers(const self_type& other) = default; bool operator==(const self_type& other) const = default; @@ -1125,6 +1138,56 @@ struct query_buffers>> { } }; +/** + * Specialization of `query_buffers` for variable-length non-nullable cells + * whose physical type is `char` and thus the "logical type" of each cell + * is `std::string`. + * + * See `query_buffers>`. + */ +template <> +struct query_buffers : public query_buffers> { + query_buffers(std::vector cells) { + for (const auto& cell : cells) { + offsets_.push_back(values_.size()); + values_.insert(values_.end(), cell.begin(), cell.end()); + } + } +}; + +template +struct Fragment { + std::tuple...> atts_; + + uint64_t size() const { + return std::get<0>(atts_).num_cells(); + } + + std::tuple<> dimensions() const { + return std::tuple<>(); + } + + std::tuple&...> attributes() const { + return std::apply( + [](const query_buffers&... 
attribute) { + return std::tuple&...>(attribute...); + }, + atts_); + } + + std::tuple<> dimensions() { + return std::tuple<>(); + } + + std::tuple&...> attributes() { + return std::apply( + [](query_buffers&... attribute) { + return std::tuple&...>(attribute...); + }, + atts_); + } +}; + /** * Data for a one-dimensional array */ @@ -1169,6 +1232,8 @@ struct Fragment1D { */ template struct Fragment2D { + using Self = Fragment2D; + query_buffers d1_; query_buffers d2_; std::tuple...> atts_; @@ -1177,6 +1242,22 @@ struct Fragment2D { return d1_.num_cells(); } + void reserve(uint64_t num_cells) { + d1_.reserve(num_cells); + d2_.reserve(num_cells); + std::apply( + [&](query_buffers&... att) { (att.reserve(num_cells), ...); }, + atts_); + } + + void resize(uint64_t num_cells) { + d1_.resize(num_cells); + d2_.resize(num_cells); + std::apply( + [&](query_buffers&... att) { (att.resize(num_cells), ...); }, + atts_); + } + std::tuple&, const query_buffers&> dimensions() const { return std::tuple&, const query_buffers&>( @@ -1202,6 +1283,8 @@ struct Fragment2D { }, atts_); } + + bool operator==(const Self& other) const = default; }; /** @@ -1369,6 +1452,25 @@ void set_fields( }(fragment.attributes()); } +template +void set_fields( + tiledb_ctx_t* ctx, + tiledb_query_t* query, + fragment_field_sizes_t& field_sizes, + F& fragment, + const tiledb::sm::ArraySchema& schema, + const fragment_field_sizes_t& field_cursors = + fragment_field_sizes_t()) { + std::function dim_name = [&](unsigned dim) { + return schema.domain().dimension_ptr(dim)->name(); + }; + std::function att_name = [&](unsigned att) { + return schema.attribute(att)->name(); + }; + return set_fields( + ctx, query, field_sizes, fragment, dim_name, att_name, field_cursors); +} + /** * @return the number of cells written into `fields` by a read query */ @@ -1379,6 +1481,19 @@ uint64_t num_cells(const F& fragment, const auto& field_sizes) { }(std::tuple_cat(fragment.dimensions(), fragment.attributes())); } +template 
+void resize_fields(F& fragment, const auto& field_sizes) { + std::apply( + [field_sizes](auto&... outfield) { + std::apply( + [&](const auto&... field_cursor) { + (outfield.finish_multipart_read(field_cursor), ...); + }, + field_sizes); + }, + std::tuple_cat(fragment.dimensions(), fragment.attributes())); +} + } // namespace query } // namespace tiledb::test::templates diff --git a/test/support/src/query_helpers.cc b/test/support/src/query_helpers.cc new file mode 100644 index 00000000000..de1bd690866 --- /dev/null +++ b/test/support/src/query_helpers.cc @@ -0,0 +1,102 @@ +/** + * @file query_helpers.cc + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2025 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ *
+ * @section DESCRIPTION
+ */
+#include "test/support/src/query_helpers.h"
+#include "tiledb/stdx/utility/to_underlying.h"
+#include "tiledb/type/apply_with_type.h"
+
+#include <sstream>
+
+namespace tiledb::test {
+
+using namespace tiledb::sm;
+
+static const char* to_sql_op(QueryConditionOp op) {
+  switch (op) {
+    case QueryConditionOp::LT:
+      return "<";
+    case QueryConditionOp::LE:
+      return "<=";
+    case QueryConditionOp::EQ:
+      return "=";
+    case QueryConditionOp::GE:
+      return ">=";
+    case QueryConditionOp::GT:
+      return ">";
+    case QueryConditionOp::NE:
+      return "<>";
+    default:
+      throw std::logic_error(
+          "Invalid query condition op: " +
+          std::to_string(stdx::to_underlying(op)));
+  }
+}
+
+std::string to_sql(const ASTNode& ast, const ArraySchema& schema) {
+  const ASTNodeVal* valnode = dynamic_cast<const ASTNodeVal*>(&ast);
+  const ASTNodeExpr* exprnode = dynamic_cast<const ASTNodeExpr*>(&ast);
+  // dynamic_cast yields nullptr on a type mismatch, so this picks the branch.
+  std::stringstream os;
+  if (valnode) {
+    const auto fname = valnode->get_field_name();
+    const auto op = valnode->get_op();
+    const auto bytes = valnode->get_data();
+
+    std::stringstream value;
+    // Print the literal using the datatype the schema reports for the field.
+    apply_with_type(
+        [&](auto t) {
+          using T = decltype(t);
+          value << *reinterpret_cast<const T*>(bytes.data());
+        },
+        schema.type(fname));
+
+    os << fname << " " << to_sql_op(op) << " " << value.str();
+  } else if (exprnode) {
+    const auto op = exprnode->get_combination_op();
+    const auto& children = exprnode->get_children();
+    if (op == QueryConditionCombinationOp::NOT) {
+      assert(children.size() == 1);
+      os << "NOT ";
+    }
+    for (unsigned i = 0; i < children.size(); i++) {
+      if (i != 0) {
+        os << " " << query_condition_combination_op_str(op) << " ";
+      }
+      os << "(" << to_sql(*children[i].get(), schema) << ")";
+    }
+  } else {
+    throw std::logic_error(
+        "Invalid query condition syntax tree node: " +
+        std::string(typeid(ast).name()));
+  }
+  return os.str();
+}
+
+}  // namespace tiledb::test
diff --git a/test/support/src/query_helpers.h b/test/support/src/query_helpers.h
new file mode 100644
index 00000000000..ae43767124b
---
/dev/null +++ b/test/support/src/query_helpers.h @@ -0,0 +1,50 @@ +/** + * @file test/support/src/query_helpers.h + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2025 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + * @section DESCRIPTION + */ + +#ifndef TILEDB_TEST_SUPPORT_QUERY_CONDITION_H +#define TILEDB_TEST_SUPPORT_QUERY_CONDITION_H + +#include "tiledb/sm/array_schema/array_schema.h" +#include "tiledb/sm/query/ast/query_ast.h" + +#include + +namespace tiledb::test { + +/** + * @return a SQL representation of the query condition syntax tree + */ +std::string to_sql( + const tiledb::sm::ASTNode& ast, + const tiledb::sm::ArraySchema& array_schema); + +} // namespace tiledb::test + +#endif diff --git a/test/support/src/vfs_helpers.h b/test/support/src/vfs_helpers.h index c91fc4dc305..5a782af16f9 100644 --- a/test/support/src/vfs_helpers.h +++ b/test/support/src/vfs_helpers.h @@ -869,7 +869,7 @@ struct VFSTestSetup { } } - Context ctx() { + Context ctx() const { return Context(ctx_c, false); } diff --git a/tiledb/oxidize/CMakeLists.txt b/tiledb/oxidize/CMakeLists.txt index d63a0ae0d25..873b434dd89 100644 --- a/tiledb/oxidize/CMakeLists.txt +++ b/tiledb/oxidize/CMakeLists.txt @@ -40,7 +40,14 @@ cxxbridge( sm/query/ast/mod.rs sm/misc/mod.rs sm/tile/mod.rs -) +) + +cxxbridge( + NAME + session + SOURCES + lib.rs +) cxxbridge( NAME @@ -61,6 +68,7 @@ oxidize( arrow cxx-interface expr + session ) oxidize( diff --git a/tiledb/oxidize/Cargo.lock b/tiledb/oxidize/Cargo.lock index 76084f7ccee..30164d65803 100644 --- a/tiledb/oxidize/Cargo.lock +++ b/tiledb/oxidize/Cargo.lock @@ -2842,6 +2842,7 @@ version = "0.1.0" dependencies = [ "tiledb-arrow", "tiledb-expr", + "tiledb-session", ] [[package]] @@ -2865,6 +2866,7 @@ name = "tiledb-expr" version = "0.1.0" dependencies = [ "anyhow", + "arrow", "cxx", "cxx-build", "datafusion", @@ -2907,6 +2909,22 @@ name = "tiledb-proptest-config" version = "0.1.0" source = "git+https://github.com/TileDB-Inc/tiledb-rs.git?branch=main#e418936fff551dd608e2a1b5e3c557f4c8e5d29d" +[[package]] +name = "tiledb-session" +version = "0.1.0" +dependencies = [ + "anyhow", + "cxx", + "cxx-build", + "datafusion", + "itertools", + "num-traits", + "thiserror 
2.0.12", + "tiledb-arrow", + "tiledb-cxx-interface", + "tiledb-expr", +] + [[package]] name = "tiledb-sys-defs" version = "0.1.0" diff --git a/tiledb/oxidize/Cargo.toml b/tiledb/oxidize/Cargo.toml index 9647dd66822..a65d844a55f 100644 --- a/tiledb/oxidize/Cargo.toml +++ b/tiledb/oxidize/Cargo.toml @@ -10,6 +10,7 @@ members = [ "staticlibs/core-objects", "staticlibs/unit-arithmetic", "staticlibs/unit-query-condition", + "session", "test-support/array-schema", "test-support/cxx-interface", "test-support/ffi", @@ -38,6 +39,7 @@ tiledb-cxx-interface = { path = "cxx-interface" } tiledb-datatype = { path = "datatype" } tiledb-expr = { path = "expr" } tiledb-pod = { git = "https://github.com/TileDB-Inc/tiledb-rs.git", branch = "main", features = [ "proptest-strategies" ] } +tiledb-session = { path = "session" } tiledb-test-array-schema = { path = "test-support/array-schema" } tiledb-test-cells = { package = "cells", git = "https://github.com/TileDB-Inc/tiledb-rs.git", branch = "main", features = [ "proptest-strategies" ] } tiledb-test-ffi = { path = "test-support/ffi" } diff --git a/tiledb/oxidize/arrow/src/lib.rs b/tiledb/oxidize/arrow/src/lib.rs index 79f7dbc2477..7f4eaa7e9d6 100644 --- a/tiledb/oxidize/arrow/src/lib.rs +++ b/tiledb/oxidize/arrow/src/lib.rs @@ -14,9 +14,12 @@ pub mod ffi { type ArrowSchema; #[cxx_name = "create"] - fn array_schema_to_arrow_schema( + fn array_schema_create_arrow_schema(schema: &ArraySchema) -> Result>; + + #[cxx_name = "project"] + fn array_schema_project_arrow_schema( schema: &ArraySchema, - select: &CxxVector, + select: &Vec, ) -> Result>; } @@ -37,7 +40,10 @@ pub mod record_batch; pub mod schema; use record_batch::{ArrowRecordBatch, to_record_batch as result_tile_to_record_batch}; -use schema::{ArrowSchema, cxx::to_arrow as array_schema_to_arrow_schema}; +use schema::{ + ArrowSchema, cxx::project_arrow as array_schema_project_arrow_schema, + cxx::to_arrow as array_schema_create_arrow_schema, +}; unsafe impl cxx::ExternType for 
ArrowRecordBatch { type Id = cxx::type_id!("tiledb::oxidize::arrow::record_batch::ArrowRecordBatch"); diff --git a/tiledb/oxidize/arrow/src/record_batch.rs b/tiledb/oxidize/arrow/src/record_batch.rs index 8ce7720e73c..9fcd22d272b 100644 --- a/tiledb/oxidize/arrow/src/record_batch.rs +++ b/tiledb/oxidize/arrow/src/record_batch.rs @@ -6,11 +6,12 @@ use std::sync::Arc; use arrow::array::{ - self as aa, Array as ArrowArray, FixedSizeListArray, GenericListArray, PrimitiveArray, + self as aa, Array as ArrowArray, FixedSizeListArray, GenericListArray, LargeStringArray, + PrimitiveArray, }; use arrow::buffer::{Buffer, NullBuffer, OffsetBuffer, ScalarBuffer}; use arrow::datatypes::{self as adt, ArrowPrimitiveType, Field}; -use arrow::record_batch::RecordBatch; +use arrow::record_batch::{RecordBatch, RecordBatchOptions}; use tiledb_cxx_interface::sm::query::readers::{ResultTile, TileTuple}; use tiledb_cxx_interface::sm::tile::Tile; @@ -39,6 +40,10 @@ pub enum FieldError { InternalUnalignedValues, #[error("Internal error: invalid variable-length data offsets: {0}")] InternalOffsets(#[from] OffsetsError), + #[error("Error reading tile: {0}")] + InvalidTileData(#[source] arrow::error::ArrowError), + #[error("Attributes with enumerations are not supported in text predicates")] + EnumerationNotSupported, } /// Wraps a [RecordBatch] for passing across the FFI boundary. 
@@ -117,8 +122,17 @@ pub unsafe fn to_record_batch( ); // SAFETY: the four asserts above rule out each of the possible error conditions - let arrow = RecordBatch::try_new(Arc::clone(&schema.0), columns) - .expect("Logic error: preconditions for constructing RecordBatch not met"); + let arrow = if columns.is_empty() { + RecordBatch::try_new_with_options( + Arc::clone(&schema.0), + columns, + &RecordBatchOptions::new().with_row_count(Some(tile.cell_num() as usize)), + ) + } else { + RecordBatch::try_new(Arc::clone(&schema.0), columns) + }; + + let arrow = arrow.expect("Logic error: preconditions for constructing RecordBatch not met"); Ok(Box::new(ArrowRecordBatch { arrow })) } @@ -221,6 +235,21 @@ unsafe fn to_arrow_array( null_buffer, ))) } + DataType::LargeUtf8 => { + let Some(var_tile) = var else { + return Err(FieldError::ExpectedVarTile); + }; + let offsets = crate::offsets::try_from_bytes(1, fixed.as_slice())?; + let values = unsafe { + // SAFETY: TODO add comment + to_buffer::(var_tile) + }?; + + Ok(Arc::new( + LargeStringArray::try_new(offsets, values.into(), null_buffer) + .map_err(FieldError::InvalidTileData)?, + )) + } DataType::LargeList(value_field) => { let Some(var_tile) = var else { return Err(FieldError::ExpectedVarTile); @@ -238,6 +267,13 @@ unsafe fn to_arrow_array( null_buffer, ))) } + DataType::Dictionary(_, _) => { + // NB: we will do this later, + // it will require some refactoring so that we build the enumeration + // ArrowArrays just once for the whole query, in addition to the + // issues with regards to the enumeration being loaded + return Err(FieldError::EnumerationNotSupported); + } _ => { // SAFETY: ensured by limited range of return values of `crate::schema::arrow_datatype` unreachable!( @@ -260,6 +296,23 @@ unsafe fn to_primitive_array( tile: &Tile, validity: Option, ) -> Result, FieldError> +where + T: ArrowPrimitiveType, +{ + let values = unsafe { + // SAFETY: TODO add comment + to_buffer::(tile) + }?; + 
Ok(Arc::new(PrimitiveArray::::new(values, validity)) as Arc) +} + +/// Returns a [Buffer] which refers to the data contained inside the [Tile]. +/// +/// # Safety +/// +/// This function is safe to call as long as the returned [Buffer] +/// is not used after the argument [Tile] is destructed. +unsafe fn to_buffer(tile: &Tile) -> Result, FieldError> where T: ArrowPrimitiveType, { @@ -270,31 +323,29 @@ where if !(prefix.is_empty() && suffix.is_empty()) { return Err(FieldError::InternalUnalignedValues); } - let tile_buffer = if let Some(ptr) = std::ptr::NonNull::new(values.as_ptr() as *mut u8) { - // SAFETY: - // - // `Buffer::from_custom_allocation` creates a buffer which refers to an existing - // memory region whose ownership is tracked by some `Arc`. - // `Allocation` is basically any type, whose `drop` implementation is responsible - // for freeing the memory. - // - // The tile memory which we reference lives on the `extern "C++"` side of the - // FFI boundary, as such we cannot use `Arc` to track its lifetime. - // - // As such: - // 1) we will use an object with trivial `drop` to set up the memory aliasing - // 2) there is an implicit lifetime requirement that the Tile must out-live - // this Buffer, else we shall suffer use after free - // 3) the caller is responsible for upholding that guarantee - unsafe { Buffer::from_custom_allocation(ptr, tile.size() as usize, Arc::new(())) } - } else { - Buffer::from_vec(Vec::::new()) - }; - Ok(Arc::new(PrimitiveArray::::new( - ScalarBuffer::from(tile_buffer), - validity, - )) as Arc) + Ok(ScalarBuffer::::from( + if let Some(ptr) = std::ptr::NonNull::new(values.as_ptr() as *mut u8) { + // SAFETY: + // + // `Buffer::from_custom_allocation` creates a buffer which refers to an existing + // memory region whose ownership is tracked by some `Arc`. + // `Allocation` is basically any type, whose `drop` implementation is responsible + // for freeing the memory. 
+ // + // The tile memory which we reference lives on the `extern "C++"` side of the + // FFI boundary, as such we cannot use `Arc` to track its lifetime. + // + // As such: + // 1) we will use an object with trivial `drop` to set up the memory aliasing + // 2) there is an implicit lifetime requirement that the Tile must out-live + // this Buffer, else we shall suffer use after free + // 3) the caller is responsible for upholding that guarantee + unsafe { Buffer::from_custom_allocation(ptr, tile.size() as usize, Arc::new(())) } + } else { + Buffer::from_vec(Vec::::new()) + }, + )) } /// Returns an [OffsetBuffer] which represents the contents of the [Tile]. diff --git a/tiledb/oxidize/arrow/src/schema.rs b/tiledb/oxidize/arrow/src/schema.rs index 4f3687d3a84..72106ea0d31 100644 --- a/tiledb/oxidize/arrow/src/schema.rs +++ b/tiledb/oxidize/arrow/src/schema.rs @@ -26,6 +26,8 @@ pub enum FieldError { InvalidCellValNum(CellValNum), #[error("Internal error: invalid discriminant for data type: {0}")] InternalInvalidDatatype(u8), + #[error("Internal error: enumeration not found: {0}")] + InternalEnumerationNotFound(String), } /// Wraps a [Schema] for passing across the FFI boundary. @@ -41,21 +43,32 @@ impl Deref for ArrowSchema { pub mod cxx { use super::*; + pub fn to_arrow(array_schema: &ArraySchema) -> Result, Error> { + Ok(Box::new(ArrowSchema(Arc::new(super::project_arrow( + array_schema, + |_: &Field| true, + )?)))) + } + /// Returns a [Schema] which represents the physical field types of /// the fields from `array_schema` which are contained in `select`. 
- pub fn to_arrow( + pub fn project_arrow( array_schema: &ArraySchema, - select: &::cxx::Vector<::cxx::String>, + select: &Vec, ) -> Result, Error> { - Ok(Box::new(ArrowSchema(Arc::new(super::to_arrow( + Ok(Box::new(ArrowSchema(Arc::new(super::project_arrow( array_schema, - |field: &Field| select.iter().any(|s| s == field.name_cxx()), + |field: &Field| select.iter().any(|s| s.as_str() == field.name_cxx()), )?)))) } } +pub fn to_arrow(array_schema: &ArraySchema) -> Result { + project_arrow(array_schema, |_: &Field| true) +} + /// Returns a [Schema] which represents the physical field types of the selected fields from `array_schema`. -pub fn to_arrow(array_schema: &ArraySchema, select: F) -> Result +pub fn project_arrow(array_schema: &ArraySchema, select: F) -> Result where F: Fn(&Field) -> bool, { @@ -63,8 +76,8 @@ where let field_name = f .name() .map_err(|e| Error::NameNotUtf8(f.name_cxx().as_bytes().to_vec(), e))?; - let arrow_type = - field_arrow_datatype(&f).map_err(|e| Error::FieldError(field_name.to_owned(), e))?; + let arrow_type = field_arrow_datatype(array_schema, &f) + .map_err(|e| Error::FieldError(field_name.to_owned(), e))?; // NB: fields can always be null due to schema evolution Ok(ArrowField::new(field_name, arrow_type, true)) @@ -77,32 +90,72 @@ where } /// Returns an [ArrowDataType] which represents the physical data type of `field`. 
-pub fn field_arrow_datatype(field: &Field) -> Result { - match field.cell_val_num() { - CellValNum::Single => Ok(arrow_datatype(field.datatype())?), +pub fn field_arrow_datatype( + array_schema: &ArraySchema, + field: &Field, +) -> Result { + if let Some(e_name) = field.enumeration_name_cxx() { + if !array_schema.has_enumeration(e_name) { + return Err(FieldError::InternalEnumerationNotFound( + e_name.to_string_lossy().into_owned(), + )); + } + + let enumeration = array_schema.enumeration_cxx(e_name); + + let key_type = arrow_datatype(field.datatype(), field.cell_val_num())?; + let value_type = if let Some(enumeration) = enumeration.as_ref() { + arrow_datatype(enumeration.datatype(), enumeration.cell_val_num())? + } else { + // NB: we don't necessarily want to return an error here + // because the enumeration might not actually be used + // in a predicate. We can return some representation + // which we will check later if it is actually used, + // and return an error then. + ArrowDataType::Null + }; + Ok(ArrowDataType::Dictionary( + Box::new(key_type), + Box::new(value_type), + )) + } else { + arrow_datatype(field.datatype(), field.cell_val_num()) + } +} + +pub fn arrow_datatype( + datatype: Datatype, + cell_val_num: CellValNum, +) -> Result { + match cell_val_num { + CellValNum::Single => Ok(arrow_primitive_datatype(datatype)?), CellValNum::Fixed(nz) => { if let Ok(fixed_length) = i32::try_from(nz.get()) { - let value_type = arrow_datatype(field.datatype())?; + let value_type = arrow_primitive_datatype(datatype)?; Ok(ArrowDataType::FixedSizeList( Arc::new(ArrowField::new_list_field(value_type, false)), fixed_length, )) } else { // cell val num greater than i32::MAX - Err(FieldError::InvalidCellValNum(field.cell_val_num())) + Err(FieldError::InvalidCellValNum(cell_val_num)) } } CellValNum::Var => { - let value_type = arrow_datatype(field.datatype())?; - Ok(ArrowDataType::LargeList(Arc::new( - ArrowField::new_list_field(value_type, false), - ))) + if 
matches!(datatype, Datatype::STRING_ASCII | Datatype::STRING_UTF8) { + Ok(ArrowDataType::LargeUtf8) + } else { + let value_type = arrow_primitive_datatype(datatype)?; + Ok(ArrowDataType::LargeList(Arc::new( + ArrowField::new_list_field(value_type, false), + ))) + } } } } /// Returns an [ArrowDataType] which represents the physical type of a single value of `datatype`. -pub fn arrow_datatype(datatype: Datatype) -> Result { +pub fn arrow_primitive_datatype(datatype: Datatype) -> Result { Ok(match datatype { Datatype::INT8 => ArrowDataType::Int8, Datatype::INT16 => ArrowDataType::Int16, diff --git a/tiledb/oxidize/cxx-interface/cc/array_schema.h b/tiledb/oxidize/cxx-interface/cc/array_schema.h index 5e3dfd3213d..464d9e877bc 100644 --- a/tiledb/oxidize/cxx-interface/cc/array_schema.h +++ b/tiledb/oxidize/cxx-interface/cc/array_schema.h @@ -27,4 +27,8 @@ void set_tile_extent(Dimension& dimension, rust::Slice domain); } // namespace dimension +namespace enumeration { +using ConstEnumeration = const tiledb::sm::Enumeration; +} + } // namespace tiledb::oxidize::sm diff --git a/tiledb/oxidize/cxx-interface/src/sm/array_schema/mod.rs b/tiledb/oxidize/cxx-interface/src/sm/array_schema/mod.rs index 03f3ff92b9d..9a83cd3ed60 100644 --- a/tiledb/oxidize/cxx-interface/src/sm/array_schema/mod.rs +++ b/tiledb/oxidize/cxx-interface/src/sm/array_schema/mod.rs @@ -79,6 +79,11 @@ mod ffi { fn datatype(&self) -> Datatype; } + #[namespace = "tiledb::oxidize::sm::enumeration"] + unsafe extern "C++" { + type ConstEnumeration; + } + #[namespace = "tiledb::sm"] unsafe extern "C++" { include!("tiledb/sm/array_schema/array_schema.h"); @@ -91,12 +96,18 @@ mod ffi { fn is_attr(&self, name: &CxxString) -> bool; fn is_dim(&self, name: &CxxString) -> bool; + fn has_attribute(&self, name: &CxxString) -> bool; + fn has_enumeration(&self, name: &CxxString) -> bool; + #[cxx_name = "attribute"] fn attribute_by_idx(&self, idx: u32) -> *const Attribute; #[cxx_name = "attribute"] fn attribute_by_name(&self, 
name: &CxxString) -> *const Attribute; + #[cxx_name = "get_enumeration"] + fn const_enumeration_cxx(&self, name: &CxxString) -> SharedPtr; + #[cxx_name = "cell_val_num"] fn cell_val_num_cxx(&self, name: &CxxString) -> u32; @@ -119,6 +130,7 @@ mod ffi { impl SharedPtr {} impl SharedPtr {} impl SharedPtr {} + impl SharedPtr {} impl SharedPtr {} impl UniquePtr {} impl UniquePtr {} @@ -239,17 +251,19 @@ impl Attribute { CellValNum::from_cxx(cxx).unwrap() } - pub fn enumeration_name_cxx(&self) -> *const cxx::CxxString { - ffi::enumeration_name_cxx(self) - } - - pub fn enumeration_name(&self) -> Option> { - let ptr = self.enumeration_name_cxx(); + pub fn enumeration_name_cxx(&self) -> Option<&cxx::CxxString> { + let ptr = ffi::enumeration_name_cxx(self); if ptr.is_null() { return None; } - let cxx = unsafe { &*ptr }; - Some(cxx.to_str()) + Some(unsafe { + // SAFETY: null check above + &*ptr + }) + } + + pub fn enumeration_name(&self) -> Option> { + self.enumeration_name_cxx().map(|s| s.to_str()) } } @@ -304,6 +318,13 @@ impl Field<'_> { } } + pub fn enumeration_name_cxx(&self) -> Option<&cxx::CxxString> { + match self { + Self::Attribute(a) => a.enumeration_name_cxx(), + Self::Dimension(_) => None, + } + } + pub fn enumeration_name(&self) -> Option> { match self { Self::Attribute(a) => a.enumeration_name(), @@ -365,4 +386,20 @@ impl ArraySchema { .map(Field::Dimension) .chain(self.attributes().map(Field::Attribute)) } + + pub fn enumeration_cxx(&self, name: &cxx::CxxString) -> cxx::SharedPtr { + let e = self.const_enumeration_cxx(name); + assert_eq!( + std::mem::size_of::>(), + std::mem::size_of::>() + ); + unsafe { + // SAFETY: + // 1) SharedPtr has the same representation regardless of generic + // 2) the deleter for `Enumeration` and `const Enumeration` is the same + // 3) the `cxx::SharedPtr` Rust API does not provide a (safe) way to + // get a mutable reference, so this transmutation preserves const-ness + std::mem::transmute::<_, cxx::SharedPtr>(e) + } + } } diff 
--git a/tiledb/oxidize/expr/Cargo.toml b/tiledb/oxidize/expr/Cargo.toml index b2d27d82f76..9313852ce89 100644 --- a/tiledb/oxidize/expr/Cargo.toml +++ b/tiledb/oxidize/expr/Cargo.toml @@ -6,6 +6,7 @@ version = { workspace = true } [dependencies] anyhow = { workspace = true } +arrow = { workspace = true } cxx = { workspace = true } datafusion = { workspace = true } itertools = { workspace = true } diff --git a/tiledb/oxidize/expr/src/lib.rs b/tiledb/oxidize/expr/src/lib.rs index 3c7995b928a..8362ed2f5ae 100644 --- a/tiledb/oxidize/expr/src/lib.rs +++ b/tiledb/oxidize/expr/src/lib.rs @@ -23,13 +23,20 @@ mod ffi { #[namespace = "tiledb::oxidize::datafusion::logical_expr"] extern "Rust" { type LogicalExpr; + fn is_predicate(&self, schema: &ArraySchema) -> Result; + fn has_aggregate_functions(&self) -> bool; fn to_string(&self) -> String; + fn columns(&self) -> Vec; + #[cxx_name = "create"] fn query_condition_to_logical_expr( schema: &ArraySchema, query_condition: &ASTNode, ) -> Result>; + + /// Returns a conjunction of the logical exprs `e1 AND e2 AND ... AND eN`. 
+ fn make_conjunction(exprs: &[Box]) -> Box; } #[namespace = "tiledb::oxidize::datafusion::physical_expr"] @@ -64,6 +71,11 @@ mod logical_expr; mod physical_expr; mod query_condition; -pub use logical_expr::LogicalExpr; +pub use logical_expr::{LogicalExpr, make_conjunction}; pub use physical_expr::{PhysicalExpr, PhysicalExprOutput, create_physical_expr}; pub use query_condition::to_datafusion as query_condition_to_logical_expr; + +unsafe impl cxx::ExternType for LogicalExpr { + type Id = cxx::type_id!("tiledb::oxidize::datafusion::logical_expr::LogicalExpr"); + type Kind = cxx::kind::Opaque; +} diff --git a/tiledb/oxidize/expr/src/logical_expr.rs b/tiledb/oxidize/expr/src/logical_expr.rs index b5c57db0b97..d805d5d649a 100644 --- a/tiledb/oxidize/expr/src/logical_expr.rs +++ b/tiledb/oxidize/expr/src/logical_expr.rs @@ -2,13 +2,89 @@ use std::fmt::{Display, Formatter, Result as FmtResult}; -use datafusion::logical_expr::Expr; +use arrow::datatypes::DataType as ArrowDataType; +use datafusion::common::tree_node::{TreeNode, TreeNodeRecursion, TreeNodeVisitor}; +use datafusion::common::{Column, DFSchema, DataFusionError, ScalarValue}; +use datafusion::logical_expr::{Expr, ExprSchemable}; +use tiledb_cxx_interface::sm::array_schema::ArraySchema; + +#[derive(Debug, thiserror::Error)] +pub enum TypeError { + #[error("Schema error: {0}")] + ArraySchema(#[from] tiledb_arrow::schema::Error), + #[error("Expression error: {0}")] + Expr(#[from] DataFusionError), +} /// Wraps a DataFusion [Expr] for passing across the FFI boundary. 
pub struct LogicalExpr(pub Expr); +impl LogicalExpr { + pub fn columns(&self) -> Vec { + self.0 + .column_refs() + .into_iter() + .map(|c| c.name.clone()) + .collect() + } + + pub fn output_type(&self, schema: &ArraySchema) -> Result { + let cols = self.0.column_refs(); + let arrow_schema = tiledb_arrow::schema::project_arrow(schema, |f| { + let Ok(field_name) = f.name() else { + // NB: if the field name is not UTF-8 then it cannot possibly match the column name + return false; + }; + cols.contains(&Column::new_unqualified(field_name)) + })?; + let dfschema = { + // SAFETY: the only error we can get from the above is if the arrow schema + // has duplicate names, which will not happen since it was constructed from + // an ArraySchema which does not allow duplicate names + DFSchema::try_from(arrow_schema).unwrap() + }; + + Ok(self.0.get_type(&dfschema)?) + } + + pub fn has_aggregate_functions(&self) -> bool { + let rec = self.0.visit(&mut AggregateFunctionChecker::default()); + let rec = { + // SAFETY: AggregateFunctionChecker does not return any errors + rec.unwrap() + }; + matches!(rec, TreeNodeRecursion::Stop) + } + + pub fn is_predicate(&self, schema: &ArraySchema) -> Result { + Ok(matches!(self.output_type(schema)?, ArrowDataType::Boolean)) + } +} + impl Display for LogicalExpr { fn fmt(&self, f: &mut Formatter) -> FmtResult { self.0.human_display().fmt(f) } } + +pub fn make_conjunction(exprs: &[Box]) -> Box { + Box::new(LogicalExpr( + datafusion::logical_expr::utils::conjunction(exprs.iter().map(|e| e.0.clone())) + .unwrap_or(Expr::Literal(ScalarValue::Boolean(Some(true)))), + )) +} + +#[derive(Default)] +struct AggregateFunctionChecker {} + +impl TreeNodeVisitor<'_> for AggregateFunctionChecker { + type Node = Expr; + + fn f_down(&mut self, node: &Self::Node) -> Result { + if matches!(node, Expr::AggregateFunction(_)) { + Ok(TreeNodeRecursion::Stop) + } else { + Ok(TreeNodeRecursion::Continue) + } + } +} diff --git a/tiledb/oxidize/expr/src/physical_expr.rs 
b/tiledb/oxidize/expr/src/physical_expr.rs index 911e99891e2..315f97dbb12 100644 --- a/tiledb/oxidize/expr/src/physical_expr.rs +++ b/tiledb/oxidize/expr/src/physical_expr.rs @@ -86,7 +86,7 @@ impl PhysicalExprOutput { &self, datatype: Datatype, ) -> Result, PhysicalExprOutputError> { - let arrow_type = tiledb_arrow::schema::arrow_datatype(datatype) + let arrow_type = tiledb_arrow::schema::arrow_primitive_datatype(datatype) .map_err(PhysicalExprOutputError::TypeUnavailable)?; let columnar_value = match &self.0 { ColumnarValue::Scalar(s) => ColumnarValue::Scalar( diff --git a/tiledb/oxidize/expr/src/query_condition.rs b/tiledb/oxidize/expr/src/query_condition.rs index abadb21a724..70101cedd96 100644 --- a/tiledb/oxidize/expr/src/query_condition.rs +++ b/tiledb/oxidize/expr/src/query_condition.rs @@ -113,7 +113,12 @@ fn leaf_ast_to_binary_expr( ); }; - fn apply(field: &Field, ast: &ASTNode, operator: Operator) -> Result + fn apply( + schema: &ArraySchema, + field: &Field, + ast: &ASTNode, + operator: Operator, + ) -> Result where T: FromBytes, ::Bytes: for<'a> TryFrom<&'a [u8]>, @@ -127,9 +132,10 @@ fn leaf_ast_to_binary_expr( .map(ScalarValue::from) .peekable(); - let expect_datatype = tiledb_arrow::schema::field_arrow_datatype(field).map_err(|e| { - InternalError::SchemaField(field.name_cxx().to_string_lossy().into_owned(), e) - })?; + let expect_datatype = + tiledb_arrow::schema::field_arrow_datatype(schema, field).map_err(|e| { + InternalError::SchemaField(field.name_cxx().to_string_lossy().into_owned(), e) + })?; let right = match field.cell_val_num() { CellValNum::Single => { @@ -199,7 +205,7 @@ fn leaf_ast_to_binary_expr( apply_physical_type!( value_type, NativeType, - apply::(&field, ast, op), + apply::(schema, &field, ast, op), |invalid: Datatype| Err(InternalError::InvalidDatatype(invalid.repr.into()).into()) ) } diff --git a/tiledb/oxidize/session/Cargo.toml b/tiledb/oxidize/session/Cargo.toml new file mode 100644 index 00000000000..e9976098a41 --- 
/dev/null +++ b/tiledb/oxidize/session/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "tiledb-session" +edition = { workspace = true } +rust-version = { workspace = true } +version = { workspace = true } + +[dependencies] +anyhow = { workspace = true } +cxx = { workspace = true } +datafusion = { workspace = true } +itertools = { workspace = true } +num-traits = { workspace = true } +thiserror = { workspace = true } +tiledb-arrow = { workspace = true } +tiledb-cxx-interface = { workspace = true } +tiledb-expr = { workspace = true } + +[build-dependencies] +cxx-build = { workspace = true } diff --git a/tiledb/oxidize/session/build.rs b/tiledb/oxidize/session/build.rs new file mode 100644 index 00000000000..aa06e631fed --- /dev/null +++ b/tiledb/oxidize/session/build.rs @@ -0,0 +1,4 @@ +fn main() { + let _bridge = cxx_build::bridge("src/lib.rs"); + println!("cargo:rerun-if-changed=src/lib.rs"); +} diff --git a/tiledb/oxidize/session/src/lib.rs b/tiledb/oxidize/session/src/lib.rs new file mode 100644 index 00000000000..69dbf5c24f3 --- /dev/null +++ b/tiledb/oxidize/session/src/lib.rs @@ -0,0 +1,101 @@ +#[cxx::bridge] +mod ffi { + #[namespace = "tiledb::sm"] + extern "C++" { + include!("tiledb/sm/array_schema/array_schema.h"); + + type ArraySchema = tiledb_cxx_interface::sm::array_schema::ArraySchema; + } + + #[namespace = "tiledb::oxidize::datafusion::logical_expr"] + extern "Rust" { + type ExternLogicalExpr; + } + + #[namespace = "tiledb::oxidize::datafusion::session"] + extern "Rust" { + type Session; + + fn new_session() -> Box; + + #[cxx_name = "parse_expr"] + fn parse_expr_ffi( + &self, + expr: &str, + array_schema: &ArraySchema, + ) -> Result>; + } +} + +#[repr(transparent)] +struct ExternLogicalExpr(pub LogicalExpr); + +unsafe impl cxx::ExternType for ExternLogicalExpr { + type Id = cxx::type_id!("tiledb::oxidize::datafusion::logical_expr::LogicalExpr"); + type Kind = cxx::kind::Opaque; +} + +fn new_session() -> Box { + Box::new(Session::new()) +} + +use 
datafusion::common::DFSchema; +use datafusion::common::tree_node::TreeNode; +use datafusion::execution::context::SessionContext; +use datafusion::execution::session_state::SessionStateBuilder; +use datafusion::logical_expr::Expr; +use tiledb_cxx_interface::sm::array_schema::ArraySchema; +use tiledb_expr::LogicalExpr; + +#[derive(Debug, thiserror::Error)] +pub enum ParseExprError { + #[error("Schema error: {0}")] + Schema(#[from] tiledb_arrow::schema::Error), + #[error("Parse error: {0}")] + Parse(#[source] datafusion::common::DataFusionError), + #[error("Type coercion error: {0}")] + TypeCoercion(#[source] datafusion::common::DataFusionError), +} + +/// Wraps a DataFusion [SessionContext] for passing across the FFI boundary. +pub struct Session(pub SessionContext); + +impl Session { + pub fn new() -> Self { + Self(SessionContext::from( + SessionStateBuilder::new_with_default_features().build(), + )) + } + + fn parse_expr_ffi( + &self, + expr: &str, + array_schema: &ArraySchema, + ) -> Result, ParseExprError> { + let e = self.parse_expr(expr, array_schema)?; + Ok(Box::new(ExternLogicalExpr(LogicalExpr(e)))) + } + + fn parse_expr(&self, expr: &str, array_schema: &ArraySchema) -> Result { + let arrow_schema = tiledb_arrow::schema::to_arrow(array_schema)?; + let df_schema = { + // SAFETY: this only errors if the names are not unique, + // which they will be because `ArraySchema` requires it + DFSchema::try_from(arrow_schema).unwrap() + }; + + let parsed = self + .0 + .parse_sql_expr(expr, &df_schema) + .map_err(ParseExprError::Parse)?; + + let mut coercion_rewriter = + datafusion::optimizer::analyzer::type_coercion::TypeCoercionRewriter::new(&df_schema); + //.map_err(ParseExprError::TypeCoercion)?; + + parsed + .rewrite(&mut coercion_rewriter) + .map(|t| t.data) + .map_err(ParseExprError::TypeCoercion) + } +} diff --git a/tiledb/oxidize/staticlibs/core-objects/Cargo.toml b/tiledb/oxidize/staticlibs/core-objects/Cargo.toml index 54e72f5d4cc..51bc95648bc 100644 --- 
a/tiledb/oxidize/staticlibs/core-objects/Cargo.toml +++ b/tiledb/oxidize/staticlibs/core-objects/Cargo.toml @@ -7,6 +7,7 @@ version = { workspace = true } [dependencies] tiledb-arrow = { workspace = true } tiledb-expr = { workspace = true } +tiledb-session = { workspace = true } [lib] name = "tiledb_core_objects_rs" diff --git a/tiledb/oxidize/staticlibs/core-objects/src/lib.rs b/tiledb/oxidize/staticlibs/core-objects/src/lib.rs index 85dea7f0663..0f4513456cb 100644 --- a/tiledb/oxidize/staticlibs/core-objects/src/lib.rs +++ b/tiledb/oxidize/staticlibs/core-objects/src/lib.rs @@ -1,2 +1,3 @@ pub use tiledb_arrow; pub use tiledb_expr; +pub use tiledb_session; diff --git a/tiledb/sm/c_api/tiledb.cc b/tiledb/sm/c_api/tiledb.cc index 27fb0a18c48..f6f503c85bf 100644 --- a/tiledb/sm/c_api/tiledb.cc +++ b/tiledb/sm/c_api/tiledb.cc @@ -584,6 +584,22 @@ int32_t tiledb_query_set_condition( return TILEDB_OK; } +capi_return_t tiledb_query_add_predicate( + tiledb_ctx_t* const ctx, + tiledb_query_t* const query, + const char* const predicate) { + // Sanity check + if (sanity_check(ctx, query) == TILEDB_ERR) { + return TILEDB_ERR; + } else if (predicate == nullptr) { + throw CAPIStatusException("Argument \"predicate\" may not be NULL"); + } + + throw_if_not_ok(query->query_->add_predicate(predicate)); + + return TILEDB_OK; +} + int32_t tiledb_query_finalize(tiledb_ctx_t* ctx, tiledb_query_t* query) { // Trivial case if (query == nullptr) @@ -2748,6 +2764,15 @@ CAPI_INTERFACE( return api_entry(ctx, query, cond); } +CAPI_INTERFACE( + query_add_predicate, + tiledb_ctx_t* const ctx, + tiledb_query_t* const query, + const char* const predicate) { + return api_entry( + ctx, query, predicate); +} + CAPI_INTERFACE(query_finalize, tiledb_ctx_t* ctx, tiledb_query_t* query) { return api_entry(ctx, query); } diff --git a/tiledb/sm/c_api/tiledb_experimental.h b/tiledb/sm/c_api/tiledb_experimental.h index 29c4c31fd94..4cf5468f25a 100644 --- a/tiledb/sm/c_api/tiledb_experimental.h +++ 
b/tiledb/sm/c_api/tiledb_experimental.h @@ -454,6 +454,34 @@ TILEDB_EXPORT int32_t tiledb_query_condition_set_use_enumeration( const tiledb_query_condition_t* cond, int use_enumeration) TILEDB_NOEXCEPT; +/* ********************************* */ +/* QUERY PREDICATE */ +/* ********************************* */ + +/** + * Adds a predicate to be applied to a read query. The added predicate + * will be analyzed and evaluated in the subarray step, query condition + * step, or both. + * + * The predicate is parsed as a SQL expression and must evaluate + * to a boolean. + * + * **Example:** + * + * @code{.c} + * const char* pred = "(row BETWEEN 1 AND 10) OR (column BETWEEN 1 AND 10)"; + * tiledb_query_add_predicate(ctx, query, pred); + * @endcode + * + * @param ctx The TileDB context. + * @param query The TileDB query. + * @param predicate A text representation of the desired predicate. + */ +TILEDB_EXPORT capi_return_t tiledb_query_add_predicate( + tiledb_ctx_t* ctx, + tiledb_query_t* query, + const char* predicate) TILEDB_NOEXCEPT; + /* ********************************* */ /* QUERY STATUS DETAILS */ /* ********************************* */ diff --git a/tiledb/sm/cpp_api/query.h b/tiledb/sm/cpp_api/query.h index 2e5308180d2..10d943929d9 100644 --- a/tiledb/sm/cpp_api/query.h +++ b/tiledb/sm/cpp_api/query.h @@ -249,6 +249,23 @@ class Query { return *this; } + /** + * Adds a predicate. The predicate will be analyzed and evaluated + * in the subarray step, query condition step, or both. + * + * The predicate is parsed as a SQL expression and must evaluate + * to a boolean. + * + * @param predicate a SQL representation of the predicate + * @return Reference to this Query + */ + Query& add_predicate(const std::string& predicate) { + auto& ctx = ctx_.get(); + ctx.handle_error(tiledb_query_add_predicate( + ctx.ptr().get(), query_.get(), predicate.c_str())); + return *this; + } + /** Returns the array of the query. 
*/ const Array& array() { return array_; diff --git a/tiledb/sm/cpp_api/query_experimental.h b/tiledb/sm/cpp_api/query_experimental.h index 1cf8f052c28..94cbd9abe91 100644 --- a/tiledb/sm/cpp_api/query_experimental.h +++ b/tiledb/sm/cpp_api/query_experimental.h @@ -68,6 +68,24 @@ class QueryExperimental { update_value_size)); } + /** + * Adds a predicate to be applied to a read query. The added predicate + * will be analyzed and evaluated in the subarray step, query condition + * step, or both. + * + * The predicate is parsed as a SQL expression and must evaluate + * to a boolean. + * + * @param ctx The TileDB context. + * @param query The TileDB query. + * @param predicate A text representation of the desired predicate. + */ + static void add_predicate( + const Context& ctx, Query& query, const char* predicate) { + ctx.handle_error(tiledb_query_add_predicate( + ctx.ptr().get(), query.ptr().get(), predicate)); + } + /** * Get the number of relevant fragments from the subarray. Should only be * called after size estimation was asked for. 
diff --git a/tiledb/sm/query/query.cc b/tiledb/sm/query/query.cc index 0fbd84db127..7764f4acaa9 100644 --- a/tiledb/sm/query/query.cc +++ b/tiledb/sm/query/query.cc @@ -59,6 +59,11 @@ #include "tiledb/sm/storage_manager/storage_manager.h" #include "tiledb/sm/tile/writer_tile_tuple.h" +#ifdef HAVE_RUST +#include "tiledb/oxidize/expr.h" +#include "tiledb/oxidize/session.h" +#endif + #include #include #include @@ -724,6 +729,27 @@ void Query::init() { fragment_name_)); } + if (!predicates_.empty()) { + try { + // treat existing query condition (if any) as datafusion + if (condition_.has_value()) { + predicates_.push_back(condition_->as_datafusion(array_schema())); + condition_.reset(); + } + + // join them together + rust::Slice> + preds(predicates_.data(), predicates_.size()); + auto conjunction = + tiledb::oxidize::datafusion::logical_expr::make_conjunction(preds); + condition_.emplace(array_schema(), std::move(conjunction)); + } catch (const rust::Error& e) { + throw QueryException( + "Error initializing predicates: " + std::string(e.what())); + } + } + // Create the query strategy if querying main array and the Subarray does // not need to be updated. 
if (!only_dim_label_query() && !subarray_.has_label_ranges()) { @@ -1494,6 +1520,49 @@ Status Query::set_condition(const QueryCondition& condition) { return Status::Ok(); } +Status Query::add_predicate(const char* predicate) { + if (type_ != QueryType::READ) { + return logger_->status( + Status_QueryError("Cannot add query predicate; Operation only " + "applicable to read queries")); + } + if (status_ != tiledb::sm::QueryStatus::UNINITIALIZED) { + return logger_->status(Status_QueryError( + "Cannot add query predicate; Adding a predicate to an already " + "initialized query is not supported.")); + } + + try { + auto box_extern_expr = + resources_.session().parse_expr(predicate, array_schema()); + auto extern_expr = box_extern_expr.into_raw(); + + // NB: Rust cxx does not have a way to have crate A construct and return + // an opaque Rust type which is defined in crate B. So above we create an + // "ExternLogicalExpr" whose representation is exactly that of + // LogicalExpr, and we can transmute the raw pointer after un-boxing it. + // This is all quite unsafe but that's life at the FFI boundary. For now, + // hopefully. + using LogicalExpr = tiledb::oxidize::datafusion::logical_expr::LogicalExpr; + auto expr = rust::Box::from_raw( + reinterpret_cast(extern_expr)); + + if (!expr->is_predicate(array_schema())) { + return Status_QueryError("Expression does not return a boolean value"); + } + if (expr->has_aggregate_functions()) { + return Status_QueryError( + "Aggregate functions in predicate is not supported"); + } + predicates_.push_back(std::move(expr)); + } catch (const rust::Error& e) { + return Status_QueryError( + "Error adding predicate: " + std::string(e.what())); + } + + return Status::Ok(); +} + Status Query::add_update_value( const char* field_name, const void* update_value, @@ -1657,7 +1726,8 @@ Status Query::submit() { throw_if_not_ok(create_strategy()); // Allocate remote buffer storage for global order writes if necessary. 
- // If we cache an entire write a query may be uninitialized for N submits. + // If we cache an entire write a query may be uninitialized for N + // submits. if (!query_remote_buffer_storage_.has_value() && type_ == QueryType::WRITE && layout_ == Layout::GLOBAL_ORDER) { query_remote_buffer_storage_.emplace(*this, buffers_); @@ -1791,8 +1861,8 @@ bool Query::is_aggregate(std::string output_field_name) const { /* ****************************** */ Layout Query::effective_layout() const { - // If the user has not set a layout, it will default to row-major, which will - // use the legacy reader on sparse arrays, and fail if aggregates were + // If the user has not set a layout, it will default to row-major, which + // will use the legacy reader on sparse arrays, and fail if aggregates were // specified. However, if only aggregates are specified and no regular data // buffers, the layout doesn't matter and we can transparently switch to the // much more efficient unordered layout. @@ -1876,14 +1946,16 @@ Status Query::create_strategy(bool skip_checks_serialization) { all_dense &= frag_md->dense(); } - // We are going to deprecate dense arrays with sparse fragments in 2.27 but - // log a warning for now. + // We are going to deprecate dense arrays with sparse fragments in 2.27 + // but log a warning for now. if (array_schema_->dense() && !all_dense) { LOG_WARN( "This dense array contains sparse fragments. Support for reading " - "sparse fragments in dense arrays will be removed in TileDB version " + "sparse fragments in dense arrays will be removed in TileDB " + "version " "2.27 to be released in September 2024. 
To make sure this array " - "continues to work after an upgrade to version 2.27 or later, please " + "continues to work after an upgrade to version 2.27 or later, " + "please " "consolidate the sparse fragments using a TileDB version 2.26 or " "earlier."); } @@ -2008,8 +2080,8 @@ Status Query::check_buffer_names() { "cells to be written")); } - // All attributes/dimensions must be provided unless this query is only for - // dimension labels. + // All attributes/dimensions must be provided unless this query is only + // for dimension labels. if (!only_dim_label_query() && !allow_separate_attribute_writes()) { auto expected_num = array_schema_->attribute_num(); expected_num += static_cast( diff --git a/tiledb/sm/query/query.h b/tiledb/sm/query/query.h index 5d39ed133cf..ec931b9e190 100644 --- a/tiledb/sm/query/query.h +++ b/tiledb/sm/query/query.h @@ -58,8 +58,16 @@ #include "tiledb/sm/storage_manager/cancellation_source.h" #include "tiledb/sm/subarray/subarray.h" +#ifdef HAVE_RUST +#include "tiledb/oxidize/rust.h" +#endif + using namespace tiledb::common; +namespace tiledb::oxidize::datafusion::logical_expr { +class LogicalExpr; +} + namespace tiledb::sm { class Array; @@ -644,6 +652,14 @@ class Query { */ Status set_condition(const QueryCondition& condition); + /** + * Adds a predicate for filtering results in a read query. + * + * @param predicate A string representation of the desired predicate. + * @return Status + */ + Status add_predicate(const char* predicate); + /** * Adds an update value for an update query. * @@ -1022,6 +1038,9 @@ class Query { /** The query condition. */ std::optional condition_; + std::vector> + predicates_; + /** The update values. 
*/ std::vector update_values_; diff --git a/tiledb/sm/query/query_condition.cc b/tiledb/sm/query/query_condition.cc index cd0be2bf5d5..09bfefd62c9 100644 --- a/tiledb/sm/query/query_condition.cc +++ b/tiledb/sm/query/query_condition.cc @@ -105,6 +105,19 @@ QueryCondition::QueryCondition( , tree_(std::move(tree)) { } +#ifdef HAVE_RUST +QueryCondition::QueryCondition( + const ArraySchema& array_schema, + rust::Box&& expr) { + const auto columns = expr->columns(); + for (const auto& c : columns) { + field_names_.insert(std::string(c.data(), c.size())); + } + + datafusion_.emplace(array_schema, std::move(expr)); +} +#endif + QueryCondition::QueryCondition(const QueryCondition& rhs) : condition_marker_(rhs.condition_marker_) , condition_index_(rhs.condition_index_) @@ -168,22 +181,19 @@ void QueryCondition::rewrite_for_schema(const ArraySchema& array_schema) { } #ifdef HAVE_RUST +rust::Box +QueryCondition::as_datafusion(const ArraySchema& array_schema) { + return tiledb::oxidize::datafusion::logical_expr::create( + array_schema, *tree_.get()); +} + bool QueryCondition::rewrite_to_datafusion(const ArraySchema& array_schema) { if (!datafusion_.has_value()) { - std::vector select(field_names().begin(), field_names().end()); - try { - auto logical_expr = tiledb::oxidize::datafusion::logical_expr::create( - array_schema, *tree_.get()); - auto dfschema = - tiledb::oxidize::arrow::schema::create(array_schema, select); - auto physical_expr = tiledb::oxidize::datafusion::physical_expr::create( - *dfschema, std::move(logical_expr)); - - datafusion_.emplace(std::move(dfschema), std::move(physical_expr)); + datafusion_.emplace(array_schema, std::move(as_datafusion(array_schema))); } catch (const ::rust::Error& e) { - throw std::logic_error( - "Unexpected error compiling expression: " + std::string(e.what())); + throw QueryConditionException( + "Error compiling expression: " + std::string(e.what())); } return true; } @@ -1317,6 +1327,12 @@ Status QueryCondition::apply( const 
std::vector>& fragment_metadata, std::vector& result_cell_slabs, const uint64_t stride) const { +#ifdef HAVE_RUST + if (!tree_ && datafusion_.has_value()) { + throw QueryConditionException("TODO not supported"); + } +#endif + if (!tree_) { return Status::Ok(); } @@ -2145,6 +2161,13 @@ Status QueryCondition::apply_dense( return Status_QueryConditionError("The result buffer is null."); } +#ifdef HAVE_RUST + if (tree_ == nullptr && datafusion_.has_value()) { + return Status_QueryConditionError( + "tiledb_query_add_predicate is not supported for dense array queries"); + } +#endif + span result_span(result_buffer + start, length); apply_tree_dense( tree_, @@ -2924,8 +2947,8 @@ Status QueryCondition::apply_sparse( try { datafusion_.value().apply(params, result_tile, result_bitmap); } catch (const ::rust::Error& e) { - throw std::logic_error( - "Unexpected error evaluating expression: " + std::string(e.what())); + throw QueryConditionException( + "Error evaluating expression: " + std::string(e.what())); } } else { apply_tree_sparse( @@ -2960,6 +2983,15 @@ uint64_t QueryCondition::condition_index() const { } #ifdef HAVE_RUST +QueryCondition::Datafusion::Datafusion( + const ArraySchema& array_schema, + rust::Box&& expr) + : schema_(tiledb::oxidize::arrow::schema::project( + array_schema, expr->columns())) + , expr_(tiledb::oxidize::datafusion::physical_expr::create( + *schema_, std::move(expr))) { +} + template void QueryCondition::Datafusion::apply( const QueryCondition::Params&, @@ -2989,7 +3021,7 @@ void QueryCondition::Datafusion::apply( result_bitmap[i] *= bitmap[i]; } } else { - throw std::logic_error( + throw QueryConditionException( "Expression evaluation bitmap has unexpected size"); } } else { @@ -3010,7 +3042,7 @@ void QueryCondition::Datafusion::apply( result_bitmap[i] *= bitmap[i]; } } else { - throw std::logic_error( + throw QueryConditionException( "Expression evaluation bitmap has unexpected size"); } } diff --git a/tiledb/sm/query/query_condition.h 
b/tiledb/sm/query/query_condition.h index 0d9e827c4eb..66c9b458d4d 100644 --- a/tiledb/sm/query/query_condition.h +++ b/tiledb/sm/query/query_condition.h @@ -50,6 +50,9 @@ using namespace tiledb::common; namespace tiledb::oxidize::arrow::schema { struct ArrowSchema; } +namespace tiledb::oxidize::datafusion::logical_expr { +class LogicalExpr; +} namespace tiledb::oxidize::datafusion::physical_expr { struct PhysicalExpr; } @@ -150,6 +153,13 @@ class QueryCondition { const std::string& condition_marker, tdb_unique_ptr&& tree); +#ifdef HAVE_RUST + /** Constructor from a datafusion expression tree */ + QueryCondition( + const ArraySchema& array_schema, + rust::Box&& expr); +#endif + /** Copy constructor. */ QueryCondition(const QueryCondition& rhs); @@ -210,6 +220,13 @@ class QueryCondition { * @return true if a rewrite occurred, false otherwise */ bool rewrite_to_datafusion(const ArraySchema& array_schema); + + /** + * @return an equivalent representation of this condition's expression tree as + * a Datafusion logical expression + */ + rust::Box + as_datafusion(const ArraySchema& array_schema); #endif /** @@ -416,6 +433,11 @@ class QueryCondition { , expr_(std::move(expr)) { } + Datafusion( + const ArraySchema& array_schema, + rust::Box&& + expr); + template void apply( const QueryCondition::Params& params, diff --git a/tiledb/sm/storage_manager/context_resources.cc b/tiledb/sm/storage_manager/context_resources.cc index 521ec52e551..e70aa516da9 100644 --- a/tiledb/sm/storage_manager/context_resources.cc +++ b/tiledb/sm/storage_manager/context_resources.cc @@ -34,6 +34,10 @@ #include "tiledb/common/memory_tracker.h" #include "tiledb/sm/rest/rest_client.h" +#ifdef HAVE_RUST +#include "tiledb/oxidize/session.h" +#endif + using namespace tiledb::common; namespace tiledb::sm { @@ -64,7 +68,11 @@ ContextResources::ContextResources( config_, compute_tp(), *logger_.get(), - create_memory_tracker())} { + create_memory_tracker())} +#ifdef HAVE_RUST + , 
session_(tiledb::oxidize::datafusion::session::new_session()) +#endif +{ ephemeral_memory_tracker_->set_type(MemoryTrackerType::EPHEMERAL); serialization_memory_tracker_->set_type(MemoryTrackerType::SERIALIZATION); diff --git a/tiledb/sm/storage_manager/context_resources.h b/tiledb/sm/storage_manager/context_resources.h index 1ea083fdc2c..70105b349d5 100644 --- a/tiledb/sm/storage_manager/context_resources.h +++ b/tiledb/sm/storage_manager/context_resources.h @@ -39,8 +39,16 @@ #include "tiledb/sm/filesystem/vfs.h" #include "tiledb/sm/stats/global_stats.h" +#ifdef HAVE_RUST +#include "tiledb/oxidize/rust.h" +#endif + using namespace tiledb::common; +namespace tiledb::oxidize::datafusion::session { +class Session; +} + namespace tiledb::sm { class MemoryTracker; @@ -115,6 +123,12 @@ class ContextResources { return *memory_tracker_manager_; } +#ifdef HAVE_RUST + const tiledb::oxidize::datafusion::session::Session& session() const { + return *session_; + } +#endif + /** * Create a new MemoryTracker * @@ -184,6 +198,10 @@ class ContextResources { /** The rest client (may be null if none was configured). */ shared_ptr rest_client_; + +#ifdef HAVE_RUST + rust::Box session_; +#endif }; } // namespace tiledb::sm From ad3f43cec64a269616a3b6706e3734effdd22967 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Mon, 30 Jun 2025 09:17:06 -0400 Subject: [PATCH 02/52] Remove assert post rebase --- tiledb/oxidize/arrow/src/record_batch.rs | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tiledb/oxidize/arrow/src/record_batch.rs b/tiledb/oxidize/arrow/src/record_batch.rs index 9fcd22d272b..095546cb72e 100644 --- a/tiledb/oxidize/arrow/src/record_batch.rs +++ b/tiledb/oxidize/arrow/src/record_batch.rs @@ -105,11 +105,6 @@ pub unsafe fn to_record_batch( .all(|(f, c)| f.data_type() == c.data_type()) ); - // SAFETY: `schema` has at least one field. - // This is not required in general, but `schema` is a projection of an `ArraySchema` - // which always has at least one dimension. 
- assert!(!columns.is_empty()); - // SAFETY: dependent on the correctness of `tile_to_arrow_array` AND the integrity of // the underlying `ResultTile`. // Neither of these conditions is a recoverable error from the user perspective - From c10ef48aa94615ed44cf974819ab0763db14af59 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Mon, 30 Jun 2025 09:17:26 -0400 Subject: [PATCH 03/52] Fill in WHERE a IS NOT NULL example --- test/src/unit-query-add-predicate.cc | 161 +++++++++++++++++---------- 1 file changed, 100 insertions(+), 61 deletions(-) diff --git a/test/src/unit-query-add-predicate.cc b/test/src/unit-query-add-predicate.cc index bd9fc2993a5..3edb972e46f 100644 --- a/test/src/unit-query-add-predicate.cc +++ b/test/src/unit-query-add-predicate.cc @@ -96,65 +96,72 @@ struct QueryAddPredicateFx { static const Cells INPUT; }; -const Cells QueryAddPredicateFx::INPUT = Cells{ - .d1_ = templates::query_buffers( - {1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4}), - .d2_ = templates::query_buffers( - {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4}), - .atts_ = std::make_tuple( - templates::query_buffers>( - std::vector>{ - 15, - std::nullopt, - std::nullopt, - 12, - std::nullopt, - 10, - 9, - std::nullopt, - 7, - 6, - 5, - 4, - std::nullopt, - 2, - 1, - 0}), - templates::query_buffers(std::vector{ - "one", - "two", - "three", - "four", - "five", - "six", - "seven", - "eight", - "nine", - "ten", - "eleven", - "twelve", - "thirteen", - "fourteen", - "fifteen", - "sixteen"}), - templates::query_buffers>( - std::vector>{ - 4, - 4, - 7, - std::nullopt, - 7, - 7, - std::nullopt, - 0, - 1, - std::nullopt, - 3, - 4, - std::nullopt, - 6, - 7, - std::nullopt}))}; +static Cells make_cells( + std::vector d1, + std::vector d2, + std::vector> a, + std::vector v, + std::vector> e) { + return Cells{ + .d1_ = templates::query_buffers(d1), + .d2_ = templates::query_buffers(d2), + .atts_ = std::make_tuple( + templates::query_buffers>(a), + templates::query_buffers(v), + 
templates::query_buffers>(e))}; +} + +const Cells QueryAddPredicateFx::INPUT = make_cells( + {1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4}, + {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4}, + {15, + std::nullopt, + std::nullopt, + 12, + std::nullopt, + 10, + 9, + std::nullopt, + 7, + 6, + 5, + 4, + std::nullopt, + 2, + 1, + 0}, + {"one", + "two", + "three", + "four", + "five", + "six", + "seven", + "eight", + "nine", + "ten", + "eleven", + "twelve", + "thirteen", + "fourteen", + "fifteen", + "sixteen"}, + {4, + 4, + 7, + std::nullopt, + 7, + 7, + std::nullopt, + 0, + 1, + std::nullopt, + 3, + 4, + std::nullopt, + 6, + 7, + std::nullopt}); void QueryAddPredicateFx::create_array( const std::string& path, tiledb_array_type_t atype) { @@ -424,8 +431,40 @@ TEST_CASE_METHOD( CHECK(result == INPUT); } - SECTION("WHERE a IS NULL") { - // TODO + SECTION("WHERE a IS NOT NULL") { + const Cells expect = make_cells( + {1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4}, + {1, 4, 2, 3, 1, 2, 3, 4, 2, 3, 4}, + {15, 12, 10, 9, 7, 6, 5, 4, 2, 1, 0}, + {"one", + "four", + "six", + "seven", + "nine", + "ten", + "eleven", + "twelve", + "fourteen", + "fifteen", + "sixteen"}, + {4, + std::nullopt, + 7, + std::nullopt, + 1, + std::nullopt, + 3, + 4, + 6, + 7, + std::nullopt}); + const auto result = + query_array(array_name, TILEDB_GLOBAL_ORDER, "a IS NOT NULL"); + CHECK(result.d1_ == expect.d1_); + CHECK(result.d2_ == expect.d2_); + CHECK(std::get<0>(result.atts_) == std::get<0>(expect.atts_)); + CHECK(std::get<1>(result.atts_) == std::get<1>(expect.atts_)); + CHECK(std::get<2>(result.atts_) == std::get<2>(expect.atts_)); } SECTION("WHERE b < 'fourteen'") { From 6017582cce127caf08563899b85c29cf87339b97 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Mon, 30 Jun 2025 10:30:04 -0400 Subject: [PATCH 04/52] Fill in other global order unit test examples --- test/src/unit-query-add-predicate.cc | 75 ++++++++++++++++++++++++---- 1 file changed, 64 insertions(+), 11 deletions(-) diff --git 
a/test/src/unit-query-add-predicate.cc b/test/src/unit-query-add-predicate.cc index 3edb972e46f..ec3fb187d31 100644 --- a/test/src/unit-query-add-predicate.cc +++ b/test/src/unit-query-add-predicate.cc @@ -290,7 +290,7 @@ TEST_CASE_METHOD( "C API: Test query add predicate errors", "[capi][query][add_predicate]") { const std::string array_name = - vfs_test_setup_.array_uri("test_qeury_add_predicate_errors"); + vfs_test_setup_.array_uri("test_query_add_predicate_errors"); create_array(array_name, TILEDB_SPARSE); write_array(array_name, TILEDB_SPARSE); @@ -458,24 +458,77 @@ TEST_CASE_METHOD( 6, 7, std::nullopt}); + const auto result = query_array(array_name, TILEDB_GLOBAL_ORDER, "a IS NOT NULL"); - CHECK(result.d1_ == expect.d1_); - CHECK(result.d2_ == expect.d2_); - CHECK(std::get<0>(result.atts_) == std::get<0>(expect.atts_)); - CHECK(std::get<1>(result.atts_) == std::get<1>(expect.atts_)); - CHECK(std::get<2>(result.atts_) == std::get<2>(expect.atts_)); + CHECK(result == expect); } - SECTION("WHERE b < 'fourteen'") { - // TODO + SECTION("WHERE v < 'fourteen'") { + const Cells expect = make_cells( + {1, 2, 2, 3, 4}, + {4, 1, 4, 3, 3}, + {12, std::nullopt, std::nullopt, 5, 1}, + {"four", "five", "eight", "eleven", "fifteen"}, + {std::nullopt, 7, 0, 3, 7}); + + const auto result = + query_array(array_name, TILEDB_GLOBAL_ORDER, "v < 'fourteen'"); + CHECK(result == expect); } SECTION("WHERE row + col <= 4") { - // TODO + const Cells expect = make_cells( + {1, 1, 1, 2, 2, 3}, + {1, 2, 3, 1, 2, 1}, + {15, std::nullopt, std::nullopt, std::nullopt, 10, 7}, + {"one", "two", "three", "five", "six", "nine"}, + {4, 4, 7, 7, 7, 1}); + + const auto result = + query_array(array_name, TILEDB_GLOBAL_ORDER, "row + col <= 4"); + CHECK(result == expect); } - SECTION("WHERE coalesce(a, row) > a") { - // TODO + SECTION("WHERE a IS NULL AND row > col") { + const Cells expect = make_cells( + {2, 4}, + {1, 1}, + {std::nullopt, std::nullopt}, + {"five", "thirteen"}, + {7, std::nullopt}); + + 
const auto result = query_array( + array_name, TILEDB_GLOBAL_ORDER, {"a IS NULL", "row > col"}); + CHECK(result == expect); + } + + SECTION("WHERE coalesce(a, row) > col") { + const Cells expect = make_cells( + {1, 1, 2, 2, 2, 3, 3, 3, 4}, + {1, 4, 1, 2, 3, 1, 2, 3, 1}, + {15, 12, std::nullopt, 10, 9, 7, 6, 5, std::nullopt}, + {"one", + "four", + "five", + "six", + "seven", + "nine", + "ten", + "eleven", + "thirteen"}, + {4, + std::nullopt, + 7, + 7, + std::nullopt, + 1, + std::nullopt, + 3, + std::nullopt}); + + const auto result = query_array( + array_name, TILEDB_GLOBAL_ORDER, {"coalesce(a, row) > col"}); + CHECK(result == expect); } } From 303e83137c00d9c720b9698aa659d6b4ff70f787 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Mon, 30 Jun 2025 11:39:20 -0400 Subject: [PATCH 05/52] unit-query-add-predicate.cc tests for other readers --- test/src/unit-query-add-predicate.cc | 216 ++++++++++++++++++++++----- test/support/src/array_templates.h | 12 ++ tiledb/sm/query/query_condition.cc | 4 +- 3 files changed, 196 insertions(+), 36 deletions(-) diff --git a/test/src/unit-query-add-predicate.cc b/test/src/unit-query-add-predicate.cc index ec3fb187d31..a75f80c291c 100644 --- a/test/src/unit-query-add-predicate.cc +++ b/test/src/unit-query-add-predicate.cc @@ -65,6 +65,8 @@ using Cells = templates::Fragment2D< struct QueryAddPredicateFx { VFSTestSetup vfs_test_setup_; + static const Cells INPUT; + Context context() const { return vfs_test_setup_.ctx(); } @@ -75,25 +77,30 @@ struct QueryAddPredicateFx { * - 'v VARCHAR NOT NULL' * - 'e UINT8:VARCHAR' */ - void create_array(const std::string& path, tiledb_array_type_t atype); + void create_array( + const std::string& path, + tiledb_array_type_t atype, + bool allow_dups = false); /** * Writes cells to saturate the ranges [[1, 4], [1, 4]] for an array * of the schema given above */ - void write_array(const std::string& path, tiledb_array_type_t atype); + void write_array( + const std::string& path, + tiledb_array_type_t 
atype, + const Cells input = INPUT); Cells query_array( const std::string& path, tiledb_layout_t layout, - std::vector predicates); + std::vector predicates, + const Config& query_config = Config()); Cells query_array( const std::string& path, tiledb_layout_t layout, const char* predicate) { return query_array(path, layout, std::vector{predicate}); } - - static const Cells INPUT; }; static Cells make_cells( @@ -164,7 +171,7 @@ const Cells QueryAddPredicateFx::INPUT = make_cells( std::nullopt}); void QueryAddPredicateFx::create_array( - const std::string& path, tiledb_array_type_t atype) { + const std::string& path, tiledb_array_type_t atype, bool allow_dups) { auto ctx = context(); Domain domain(ctx); @@ -175,6 +182,7 @@ void QueryAddPredicateFx::create_array( schema.set_tile_order(TILEDB_ROW_MAJOR); schema.set_cell_order(TILEDB_ROW_MAJOR); schema.set_domain(domain); + schema.set_allows_dups(allow_dups); schema.add_attribute(Attribute::create(ctx, "a").set_nullable(true)); schema.add_attribute(Attribute::create(ctx, "v")); @@ -203,7 +211,7 @@ void QueryAddPredicateFx::create_array( } void QueryAddPredicateFx::write_array( - const std::string& path, tiledb_array_type_t atype) { + const std::string& path, tiledb_array_type_t atype, const Cells input) { auto ctx = context(); Array array(ctx, path, TILEDB_WRITE); Query query(ctx, array); @@ -218,7 +226,7 @@ void QueryAddPredicateFx::write_array( std::optional, std::vector, std::optional> - cells = {.atts_ = INPUT.atts_}; + cells = {.atts_ = input.atts_}; auto field_sizes = templates::query::make_field_sizes(cells); templates::query::set_fields( @@ -231,12 +239,12 @@ void QueryAddPredicateFx::write_array( query.submit(); } else { auto field_sizes = - templates::query::make_field_sizes(const_cast(INPUT)); + templates::query::make_field_sizes(const_cast(input)); templates::query::set_fields( ctx.ptr().get(), query.ptr().get(), field_sizes, - const_cast(INPUT), + const_cast(input), 
array.ptr().get()->array_schema_latest()); query.submit(); } @@ -245,13 +253,14 @@ void QueryAddPredicateFx::write_array( Cells QueryAddPredicateFx::query_array( const std::string& path, tiledb_layout_t layout, - std::vector predicates) { + std::vector predicates, + const Config& config) { auto ctx = context(); Array array(ctx, path, TILEDB_READ); Query query(ctx, array); - query.set_layout(layout); + query.set_config(config).set_layout(layout); Cells out; out.resize(32); @@ -382,7 +391,7 @@ TEST_CASE_METHOD( "C API: Test query add predicate dense", "[capi][query][add_predicate]") { const std::string array_name = - vfs_test_setup_.array_uri("test_qeury_add_predicate_dense"); + vfs_test_setup_.array_uri("test_query_add_predicate_dense"); create_array(array_name, TILEDB_DENSE); write_array(array_name, TILEDB_DENSE); @@ -395,25 +404,40 @@ TEST_CASE_METHOD( REQUIRE_THROWS(query_array(array_name, TILEDB_HILBERT, "row >= 3")); } -TEST_CASE_METHOD( - QueryAddPredicateFx, - "C API: Test query add predicate legacy", - "[capi][query][add_predicate]") { - const std::string array_name = - vfs_test_setup_.array_uri("test_qeury_add_predicate_legacy"); - // TODO -} - TEST_CASE_METHOD( QueryAddPredicateFx, "C API: Test query add predicate sparse unsupported query order", "[capi][query][add_predicate]") { const std::string array_name = - vfs_test_setup_.array_uri("test_qeury_add_predicate_sparse_unsupported"); + vfs_test_setup_.array_uri("test_query_add_predicate_sparse_unsupported"); create_array(array_name, TILEDB_SPARSE); write_array(array_name, TILEDB_SPARSE); - // TODO + + const auto match = Catch::Matchers::ContainsSubstring( + "This query does not support predicates added with " + "tiledb_query_add_predicate"); + + SECTION("Row major") { + REQUIRE_THROWS_WITH( + query_array(array_name, TILEDB_ROW_MAJOR, {"a IS NULL", "row > col"}), + match); + } + + SECTION("Col major") { + REQUIRE_THROWS_WITH( + query_array(array_name, TILEDB_COL_MAJOR, {"a IS NULL", "row > col"}), + 
match); + } + + SECTION("Legacy global order") { + Config qconf; + qconf["sm.query.sparse_global_order.reader"] = "legacy"; + REQUIRE_THROWS_WITH( + query_array( + array_name, TILEDB_GLOBAL_ORDER, {"a IS NULL", "row > col"}, qconf), + match); + } } TEST_CASE_METHOD( @@ -421,13 +445,15 @@ TEST_CASE_METHOD( "C API: Test query add predicate sparse global order", "[capi][query][add_predicate]") { const std::string array_name = - vfs_test_setup_.array_uri("test_qeury_add_predicate_sparse_global_order"); + vfs_test_setup_.array_uri("test_query_add_predicate_sparse_global_order"); + + const auto query_order = GENERATE(TILEDB_GLOBAL_ORDER, TILEDB_UNORDERED); create_array(array_name, TILEDB_SPARSE); write_array(array_name, TILEDB_SPARSE); SECTION("WHERE TRUE") { - const auto result = query_array(array_name, TILEDB_GLOBAL_ORDER, "TRUE"); + const auto result = query_array(array_name, query_order, "TRUE"); CHECK(result == INPUT); } @@ -459,8 +485,7 @@ TEST_CASE_METHOD( 7, std::nullopt}); - const auto result = - query_array(array_name, TILEDB_GLOBAL_ORDER, "a IS NOT NULL"); + const auto result = query_array(array_name, query_order, "a IS NOT NULL"); CHECK(result == expect); } @@ -472,8 +497,7 @@ TEST_CASE_METHOD( {"four", "five", "eight", "eleven", "fifteen"}, {std::nullopt, 7, 0, 3, 7}); - const auto result = - query_array(array_name, TILEDB_GLOBAL_ORDER, "v < 'fourteen'"); + const auto result = query_array(array_name, query_order, "v < 'fourteen'"); CHECK(result == expect); } @@ -485,8 +509,7 @@ TEST_CASE_METHOD( {"one", "two", "three", "five", "six", "nine"}, {4, 4, 7, 7, 7, 1}); - const auto result = - query_array(array_name, TILEDB_GLOBAL_ORDER, "row + col <= 4"); + const auto result = query_array(array_name, query_order, "row + col <= 4"); CHECK(result == expect); } @@ -498,8 +521,8 @@ TEST_CASE_METHOD( {"five", "thirteen"}, {7, std::nullopt}); - const auto result = query_array( - array_name, TILEDB_GLOBAL_ORDER, {"a IS NULL", "row > col"}); + const auto result = + 
query_array(array_name, query_order, {"a IS NULL", "row > col"}); CHECK(result == expect); } @@ -527,8 +550,131 @@ TEST_CASE_METHOD( 3, std::nullopt}); + const auto result = + query_array(array_name, query_order, "coalesce(a, row) > col"); + CHECK(result == expect); + } + + SECTION("WHERE e < 'california'") { + // enumeration not supported yet + REQUIRE_THROWS_WITH( + query_array(array_name, query_order, "e < 'california'"), + Catch::Matchers::ContainsSubstring( + "QueryCondition: Error evaluating expression: Cannot process field " + "'e': Attributes with enumerations are not supported in text " + "predicates")); + } +} + +TEST_CASE_METHOD( + QueryAddPredicateFx, + "C API: Test query add predicate sparse unordered with dups", + "[capi][query][add_predicate]") { + const std::string array_name = vfs_test_setup_.array_uri( + "test_query_add_predicate_sparse_unordered_with_dups"); + + create_array(array_name, TILEDB_SPARSE, true); + + const auto query_order = TILEDB_UNORDERED; + + const Cells f2 = make_cells( + {1, 1, 2, 2, 3, 3, 4, 4}, + {1, 4, 2, 3, 1, 4, 2, 3}, + {-1, std::nullopt, std::nullopt, -4, std::nullopt, -6, -7, std::nullopt}, + {"ένα", "δύο", "τρία", "τέσσερα", "πέντε", "έξι", "επτά", "οκτώ"}, + {0, 7, 1, std::nullopt, 2, 6, std::nullopt, 3}); + const Cells f3 = make_cells( + {1, 1, 2, 2, 3, 3, 4, 4}, + {1, 2, 3, 4, 1, 2, 3, 4}, + {-9, -10, -11, -12, std::nullopt, -14, -15, -16}, + {"uno", "dos", "tres", "quatro", "cinco", "seis", "siete", "ocho"}, + {7, 0, 6, std::nullopt, 1, 5, std::nullopt, 2}); + + // fragment 1: base input + write_array(array_name, TILEDB_SPARSE); + write_array(array_name, TILEDB_SPARSE, f2); + write_array(array_name, TILEDB_SPARSE, f3); + + SECTION("WHERE TRUE") { + const Cells expect = templates::query::concat({INPUT, f2, f3}); + const auto result = query_array(array_name, query_order, "TRUE"); + CHECK(result == expect); + } + + SECTION("WHERE v < 'fourteen'") { + const Cells expect = make_cells( + {1, 2, 2, 3, 4, 1, 3}, + {4, 1, 4, 
3, 3, 2, 1}, + {12, std::nullopt, std::nullopt, 5, 1, -10, std::nullopt}, + {"four", "five", "eight", "eleven", "fifteen", "dos", "cinco"}, + {std::nullopt, 7, 0, 3, 7, 0, 1}); + + const auto result = query_array(array_name, query_order, "v < 'fourteen'"); + CHECK(result == expect); + } + + SECTION("WHERE row + col <= 4") { + const Cells expect = make_cells( + {1, 1, 1, 2, 2, 3, 1, 2, 3, 1, 1, 3}, + {1, 2, 3, 1, 2, 1, 1, 2, 1, 1, 2, 1}, + {15, + std::nullopt, + std::nullopt, + std::nullopt, + 10, + 7, + -1, + std::nullopt, + std::nullopt, + -9, + -10, + std::nullopt}, + {"one", + "two", + "three", + "five", + "six", + "nine", + "ένα", + "τρία", + "πέντε", + "uno", + "dos", + "cinco"}, + {4, 4, 7, 7, 7, 1, 0, 1, 2, 7, 0, 1}); + + const auto result = query_array(array_name, query_order, "row + col <= 4"); + CHECK(result == expect); + } + + SECTION("WHERE a IS NULL AND row > col") { + const Cells expect = make_cells( + {2, 4, 3, 4, 3}, + {1, 1, 1, 3, 1}, + {std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt}, + {"five", "thirteen", "πέντε", "οκτώ", "cinco"}, + {7, std::nullopt, 2, 3, 1}); + + const auto result = + query_array(array_name, query_order, {"a IS NULL", "row > col"}); + CHECK(result == expect); + } + + SECTION("WHERE octet_length(v) > char_length(v)") { + const Cells expect = f2; + const auto result = query_array( - array_name, TILEDB_GLOBAL_ORDER, {"coalesce(a, row) > col"}); + array_name, query_order, "octet_length(v) > char_length(v)"); CHECK(result == expect); } + + SECTION("WHERE e < 'california'") { + // enumeration not supported yet + REQUIRE_THROWS_WITH( + query_array(array_name, query_order, "e < 'california'"), + Catch::Matchers::ContainsSubstring( + "QueryCondition: Error evaluating expression: Cannot process field " + "'e': Attributes with enumerations are not supported in text " + "predicates")); + } } diff --git a/test/support/src/array_templates.h b/test/support/src/array_templates.h index f1b085e44b6..ad5dc56f660 100644 --- 
a/test/support/src/array_templates.h +++ b/test/support/src/array_templates.h @@ -1494,6 +1494,18 @@ void resize_fields(F& fragment, const auto& field_sizes) { std::tuple_cat(fragment.dimensions(), fragment.attributes())); } +template +F concat(std::initializer_list fragments) { + F concat; + auto d = concat.dimensions(); + auto a = concat.attributes(); + for (const F& fragment : fragments) { + stdx::extend(d, fragment.dimensions()); + stdx::extend(a, fragment.attributes()); + } + return concat; +} + } // namespace query } // namespace tiledb::test::templates diff --git a/tiledb/sm/query/query_condition.cc b/tiledb/sm/query/query_condition.cc index 09bfefd62c9..ab346a77c67 100644 --- a/tiledb/sm/query/query_condition.cc +++ b/tiledb/sm/query/query_condition.cc @@ -1329,7 +1329,9 @@ Status QueryCondition::apply( const uint64_t stride) const { #ifdef HAVE_RUST if (!tree_ && datafusion_.has_value()) { - throw QueryConditionException("TODO not supported"); + throw QueryConditionException( + "This query does not support predicates added with " + "tiledb_query_add_predicate"); } #endif From 7aaedd5c5432f48f7e20ad599e78bc809aadc0a7 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Tue, 1 Jul 2025 08:37:31 -0400 Subject: [PATCH 06/52] Move datafusion session to query instead of ContextResources --- tiledb/sm/query/query.cc | 14 ++++++++++++-- tiledb/sm/query/query.h | 17 ++++++++++++++++- tiledb/sm/storage_manager/context_resources.cc | 10 +--------- tiledb/sm/storage_manager/context_resources.h | 18 ------------------ 4 files changed, 29 insertions(+), 30 deletions(-) diff --git a/tiledb/sm/query/query.cc b/tiledb/sm/query/query.cc index 7764f4acaa9..a41e5a70b40 100644 --- a/tiledb/sm/query/query.cc +++ b/tiledb/sm/query/query.cc @@ -1532,9 +1532,14 @@ Status Query::add_predicate(const char* predicate) { "initialized query is not supported.")); } +#ifdef HAVE_RUST try { - auto box_extern_expr = - resources_.session().parse_expr(predicate, array_schema()); + if 
(!session_.has_value()) { + session_.emplace( + std::move(tiledb::oxidize::datafusion::session::new_session())); + } + + auto box_extern_expr = (*session_)->parse_expr(predicate, array_schema()); auto extern_expr = box_extern_expr.into_raw(); // NB: Rust cxx does not have a way to have crate A construct and return @@ -1561,6 +1566,11 @@ Status Query::add_predicate(const char* predicate) { } return Status::Ok(); +#else + return logger_->status( + Status_QueryError("Cannot add query predicate: feature requires build " + "configuration '-DTILEDB_RUST=ON'")); +#endif } Status Query::add_update_value( diff --git a/tiledb/sm/query/query.h b/tiledb/sm/query/query.h index ec931b9e190..5834d716ba9 100644 --- a/tiledb/sm/query/query.h +++ b/tiledb/sm/query/query.h @@ -64,10 +64,18 @@ using namespace tiledb::common; -namespace tiledb::oxidize::datafusion::logical_expr { +namespace tiledb::oxidize::datafusion { + +namespace logical_expr { class LogicalExpr; } +namespace session { +class Session; +} + +} // namespace tiledb::oxidize::datafusion + namespace tiledb::sm { class Array; @@ -1038,8 +1046,15 @@ class Query { /** The query condition. */ std::optional condition_; +#ifdef HAVE_RUST + /** Datafusion context for parsing and evaluating predicates */ + std::optional> + session_; + + /** Predicates */ std::vector> predicates_; +#endif /** The update values. 
*/ std::vector update_values_; diff --git a/tiledb/sm/storage_manager/context_resources.cc b/tiledb/sm/storage_manager/context_resources.cc index e70aa516da9..521ec52e551 100644 --- a/tiledb/sm/storage_manager/context_resources.cc +++ b/tiledb/sm/storage_manager/context_resources.cc @@ -34,10 +34,6 @@ #include "tiledb/common/memory_tracker.h" #include "tiledb/sm/rest/rest_client.h" -#ifdef HAVE_RUST -#include "tiledb/oxidize/session.h" -#endif - using namespace tiledb::common; namespace tiledb::sm { @@ -68,11 +64,7 @@ ContextResources::ContextResources( config_, compute_tp(), *logger_.get(), - create_memory_tracker())} -#ifdef HAVE_RUST - , session_(tiledb::oxidize::datafusion::session::new_session()) -#endif -{ + create_memory_tracker())} { ephemeral_memory_tracker_->set_type(MemoryTrackerType::EPHEMERAL); serialization_memory_tracker_->set_type(MemoryTrackerType::SERIALIZATION); diff --git a/tiledb/sm/storage_manager/context_resources.h b/tiledb/sm/storage_manager/context_resources.h index 70105b349d5..1ea083fdc2c 100644 --- a/tiledb/sm/storage_manager/context_resources.h +++ b/tiledb/sm/storage_manager/context_resources.h @@ -39,16 +39,8 @@ #include "tiledb/sm/filesystem/vfs.h" #include "tiledb/sm/stats/global_stats.h" -#ifdef HAVE_RUST -#include "tiledb/oxidize/rust.h" -#endif - using namespace tiledb::common; -namespace tiledb::oxidize::datafusion::session { -class Session; -} - namespace tiledb::sm { class MemoryTracker; @@ -123,12 +115,6 @@ class ContextResources { return *memory_tracker_manager_; } -#ifdef HAVE_RUST - const tiledb::oxidize::datafusion::session::Session& session() const { - return *session_; - } -#endif - /** * Create a new MemoryTracker * @@ -198,10 +184,6 @@ class ContextResources { /** The rest client (may be null if none was configured). 
*/ shared_ptr rest_client_; - -#ifdef HAVE_RUST - rust::Box session_; -#endif }; } // namespace tiledb::sm From 3668cda5d7775f8ede73d006cff5d739c0431a23 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Tue, 1 Jul 2025 09:02:53 -0400 Subject: [PATCH 07/52] Tweak example comment --- examples/c_api/query_add_predicate.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/c_api/query_add_predicate.c b/examples/c_api/query_add_predicate.c index 42a3ab1f87d..c3a168e2f64 100644 --- a/examples/c_api/query_add_predicate.c +++ b/examples/c_api/query_add_predicate.c @@ -27,8 +27,8 @@ * * @section DESCRIPTION * - * This example demonstrates using the `tiledb_query_add_predicate` API - * to add one or more text predicates to a query. This API parses a SQL + * This example demonstrates using the experimental `tiledb_query_add_predicate` + * API to add one or more text predicates to a query. This API parses a SQL * predicate and uses it to filter results inside of the storage engine * before returning them to the user. * @@ -38,7 +38,8 @@ * the same results. * * This example also has additional queries which use predicates which - * combine dimensions and attributes. + * combine dimensions and attributes, highlighting a capability which + * cannot be replicated by just subarrays and query conditions. 
*/ #include From 4e2eebf6057a77d96f973abafb8914e076f29df0 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Tue, 1 Jul 2025 11:27:48 -0400 Subject: [PATCH 08/52] Enumeration::create const std::vector --- tiledb/sm/cpp_api/enumeration_experimental.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tiledb/sm/cpp_api/enumeration_experimental.h b/tiledb/sm/cpp_api/enumeration_experimental.h index a9fe7474be5..79ed0250732 100644 --- a/tiledb/sm/cpp_api/enumeration_experimental.h +++ b/tiledb/sm/cpp_api/enumeration_experimental.h @@ -396,7 +396,7 @@ class Enumeration { static Enumeration create( const Context& ctx, const std::string& name, - std::vector& values, + const std::vector& values, bool ordered = false, std::optional type = std::nullopt) { using DataT = impl::TypeHandler; @@ -449,7 +449,7 @@ class Enumeration { static Enumeration create( const Context& ctx, const std::string& name, - std::vector>& values, + const std::vector>& values, bool ordered = false, std::optional type = std::nullopt) { using DataT = impl::TypeHandler; From bd16e7b1a3e4306829f573120e57933fff0be982 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Tue, 1 Jul 2025 11:28:18 -0400 Subject: [PATCH 09/52] Tweak example --- examples/c_api/query_add_predicate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/c_api/query_add_predicate.c b/examples/c_api/query_add_predicate.c index c3a168e2f64..2c64994aa9f 100644 --- a/examples/c_api/query_add_predicate.c +++ b/examples/c_api/query_add_predicate.c @@ -452,12 +452,12 @@ int main() { printf("\n"); // Execute a read query with query condition `b < "eve"`. - printf("SELECT * WHERE b < 'eve'\n"); + printf("WHERE b < 'eve'\n"); RETURN_IF_NOT_OK(read_array_with_predicate(ctx, "b < 'eve'")); printf("\n"); // Execute a read query with query condition `c >= 1`. 
- printf("SELECT * WHERE c >= 1\n"); + printf("WHERE c >= 1\n"); RETURN_IF_NOT_OK(read_array_with_predicate(ctx, "c >= 1")); printf("\n"); @@ -476,7 +476,7 @@ int main() { // BEGIN EXAMPLES WITH ENUMERATIONS printf("WHERE e = 'california'\n"); { - // error is expected since the enumeration is not loaded + // error is expected as enumerations are not supported yet const int32_t ret = read_array_with_predicate(ctx, "e = 'california'"); if (ret != TILEDB_ERR) { return TILEDB_ERR; From 40cb8944956d2d412ad56083b2fcd94359ce6c37 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Tue, 1 Jul 2025 11:28:34 -0400 Subject: [PATCH 10/52] cpp example --- examples/cpp_api/query_add_predicate.cc | 348 ++++++++++++++++++++++++ 1 file changed, 348 insertions(+) create mode 100644 examples/cpp_api/query_add_predicate.cc diff --git a/examples/cpp_api/query_add_predicate.cc b/examples/cpp_api/query_add_predicate.cc new file mode 100644 index 00000000000..5314c7564d8 --- /dev/null +++ b/examples/cpp_api/query_add_predicate.cc @@ -0,0 +1,348 @@ +/** + * @file query_add_predicate.cc + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2025 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This example demonstrates using the `QueryExperimental::add_predicate` + * API to add one or more text predicates to a query. This API parses a SQL + * predicate and uses it to filter results inside of the storage engine + * before returning them to the user. + * + * The array used in this example is identical to that of the + * `query_condition_sparse` example. The first group of predicates which + * run are text equivalents of the predicates in that example, and produce + * the same results. + * + * This example also has additional queries which use predicates which + * combine dimensions and attributes, highlighting a capability which + * cannot be replicated by just subarrays and query conditions. + */ + +#include +#include +#include +#include +#include + +using namespace tiledb; + +// Name of array. +std::string array_name("array_query_add_predicate"); + +// Enumeration variants +const std::vector us_states = { + "alabama", + "alaska", + "arizona", + "arkansas", + "california", + "colorado", + "connecticut", + "etc"}; + +/** + * @brief Function to print the values of all the attributes for one + * index of this array. + * + * @param a Attribute a's value. + * @param b Attribute b's value. + * @param c Attribute c's value. + * @param d Attribute d's value. + */ +void print_elem( + std::optional a, + std::string b, + int32_t c, + float d, + std::optional e) { + std::cout << "{" << (a.has_value() ? std::to_string(a.value()) : "null") + << ", " << b << ", " << c << ", " << d << ", " + << (e.has_value() ? + (e.value() < us_states.size() ? 
+ us_states[e.value()] : + "(invalid key " + std::to_string(e.value()) + ")") : + "null") + << "}" << std::endl; +} + +/** + * @brief Function to create the TileDB array used in this example. + * The array will be 1D with size 1 with dimension "index". + * The bounds on the index will be 0 through 9, inclusive. + * + * The array has four attributes. The four attributes are + * - "a" (type int) + * - "b" (type std::string) + * - "c" (type int32_t) + * - "d" (type float) + * + * @param ctx The context. + */ +void create_array(Context& ctx) { + // Creating the domain and the dimensions. + Domain domain(ctx); + domain.add_dimension(Dimension::create(ctx, "index", {{0, 9}})); + + // The array will be sparse. + ArraySchema schema(ctx, TILEDB_SPARSE); + schema.set_domain(domain).set_order({{TILEDB_ROW_MAJOR}}); + + // Adding the attributes of the array to the array schema. + Attribute a = Attribute::create(ctx, "a").set_nullable(true); + schema.add_attribute(a) + .add_attribute(Attribute::create(ctx, "b")) + .add_attribute(Attribute::create(ctx, "c")) + .add_attribute(Attribute::create(ctx, "d")); + + // Create enumeration and an attribute using it + ArraySchemaExperimental::add_enumeration( + ctx, + schema, + Enumeration::create(ctx, std::string("us_states"), us_states)); + + { + auto e = Attribute::create(ctx, "e").set_nullable(true); + AttributeExperimental::set_enumeration_name(ctx, e, "us_states"); + schema.add_attribute(e); + } + + // Create the (empty) array. + Array::create(ctx, array_name, schema); +} + +/** + * @brief Execute a write on array query_condition_sparse array + * which then stores the following data in the array. The table + * is organized by dimension/attribute. 
+ * + * index | a | b | c | d + * ------------------------------- + * 0 | null | alice | 0 | 4.1 + * 1 | 2 | bob | 0 | 3.4 + * 2 | null | craig | 0 | 5.6 + * 3 | 4 | dave | 0 | 3.7 + * 4 | null | erin | 0 | 2.3 + * 5 | 6 | frank | 0 | 1.7 + * 6 | null | grace | 1 | 3.8 + * 7 | 8 | heidi | 2 | 4.9 + * 8 | null | ivan | 3 | 3.2 + * 9 | 10 | judy | 4 | 3.1 + * + * @param ctx The context. + */ +void write_array(Context& ctx) { + // Create data buffers that store the values to be written in. + std::vector dim_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + std::vector a_data = {0, 2, 0, 4, 0, 6, 0, 8, 0, 10}; + std::vector a_data_validity = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; + std::vector b_strs = { + "alice", + "bob", + "craig", + "dave", + "erin", + "frank", + "grace", + "heidi", + "ivan", + "judy"}; + std::string b_data = ""; + std::vector b_data_offsets; + for (const auto& elem : b_strs) { + b_data_offsets.push_back(b_data.size()); + b_data += elem; + } + std::vector c_data = {0, 0, 0, 0, 0, 0, 1, 2, 3, 4}; + std::vector d_data = { + 4.1, 3.4, 5.6, 3.7, 2.3, 1.7, 3.8, 4.9, 3.2, 3.1}; + + std::vector e_keys = {2, 7, 5, 6, 100, 3, 7, 7, 5, 4}; + std::vector e_validity = {1, 1, 1, 1, 0, 1, 1, 1, 1, 1}; + + // Execute the write query. + Array array_w(ctx, array_name, TILEDB_WRITE); + Query query_w(ctx, array_w); + query_w.set_layout(TILEDB_UNORDERED) + .set_data_buffer("index", dim_data) + .set_data_buffer("a", a_data) + .set_validity_buffer("a", a_data_validity) + .set_data_buffer("b", b_data) + .set_offsets_buffer("b", b_data_offsets) + .set_data_buffer("c", c_data) + .set_data_buffer("d", d_data) + .set_data_buffer("e", e_keys) + .set_validity_buffer("e", e_validity); + + query_w.submit(); + query_w.finalize(); + array_w.close(); +} + +/** + * @brief Executes the read query for the array created in write_array. + * + * @param ctx The context. + * @param qc The query condition to execute the query with. 
+ */ +void read_array_with_predicates( + Context& ctx, std::vector predicates) { + const unsigned reserve_cells = 16; + + // Create data buffers to read the values into. + std::vector a_data(reserve_cells); + std::vector a_data_validity(reserve_cells); + + // We initialize the string b_data to have enough space to + // contain the total length of all of the strings written + // into attribute b + std::string b_data; + b_data.resize(256); + + std::vector b_data_offsets(reserve_cells); + std::vector c_data(reserve_cells); + std::vector d_data(reserve_cells); + std::vector e_keys(reserve_cells); + std::vector e_validity(reserve_cells); + + // Execute the read query. + Array array(ctx, array_name, TILEDB_READ); + Query query(ctx, array); + query.set_layout(TILEDB_GLOBAL_ORDER) + .set_data_buffer("a", a_data) + .set_validity_buffer("a", a_data_validity) + .set_data_buffer("b", b_data) + .set_offsets_buffer("b", b_data_offsets) + .set_data_buffer("c", c_data) + .set_data_buffer("d", d_data) + .set_data_buffer("e", e_keys) + .set_validity_buffer("e", e_validity); + + for (const auto& predicate : predicates) { + QueryExperimental::add_predicate(ctx, query, predicate.c_str()); + } + + query.submit(); + + // Collect the results of the read query. The number of elements + // the filtered array contains is in num_elements_result. + // The length of the filtered substring of all the data is in + // b_data, and all the offsets for filtered individual elements + // are in b_data_offsets. + auto table = query.result_buffer_elements_nullable(); + size_t num_elements_result = std::get<1>(table["c"]); + uint64_t b_str_length = std::get<1>(table["b"]); + b_data_offsets.push_back(b_str_length); + + // Here we print all the elements that are returned by the query. + for (size_t i = 0; i < num_elements_result; ++i) { + // We pass in nullopt if the data is invalid, per the validity buffer. + print_elem( + (a_data_validity[i] ? 
std::optional{a_data[i]} : std::nullopt), + b_data.substr( + b_data_offsets[i], b_data_offsets[i + 1] - b_data_offsets[i]), + c_data[i], + d_data[i], + (e_validity[i] ? std::optional{e_keys[i]} : std::nullopt)); + } + + query.finalize(); + array.close(); +} + +int main() { + // Create the context. + Context ctx; + VFS vfs(ctx); + if (!vfs.is_dir(array_name)) { + // Create and write data to the array. + create_array(ctx); + write_array(ctx); + } + + // EXAMPLES FROM query_condition_sparse.cc EXAMPLE + + // Printing the entire array. + std::cout << "WHERE TRUE" << std::endl; + read_array_with_predicates(ctx, {}); + std::cout << std::endl; + + // Execute a read query with query condition `a = null`. + std::cout << "WHERE a IS NULL" << std::endl; + read_array_with_predicates(ctx, {"a IS NULL"}); + std::cout << std::endl; + + // Execute a read query with query condition `b < "eve"`. + std::cout << "WHERE b < 'eve'" << std::endl; + read_array_with_predicates(ctx, {"b < 'eve'"}); + std::cout << std::endl; + + // Execute a read query with query condition `c >= 1`. + std::cout << "WHERE c >= 1" << std::endl; + read_array_with_predicates(ctx, {"c >= 1"}); + std::cout << std::endl; + + // Execute a read query with query condition `3.0f <= d AND d <= 4.0f`. + std::cout << "WHERE d BETWEEN 3.0 AND 4.0" << std::endl; + QueryCondition qc3(ctx); + read_array_with_predicates(ctx, {"d BETWEEN 3.0 AND 4.0"}); + std::cout << std::endl; + + // Execute a read query with query condition `3.0f <= d AND d <= 4.0f AND a != + // null AND b < \"eve\"`. 
+ std::cout << "WHERE d BETWEEN 3.0 AND 4.0 AND a IS NOT NULL AND b < 'eve'" + << std::endl; + read_array_with_predicates( + ctx, {"d BETWEEN 3.0 AND 4.0", "a IS NOT NULL", "b < 'eve'"}); + std::cout << std::endl; + + // BEGIN EXAMPLES WITH ENUMERATIONS + // error is expected as enumerations are not supported yet + std::cout << "WHERE e = 'california'" << std::endl; + try { + read_array_with_predicates(ctx, {"e = 'california'"}); + // should not get here + return TILEDB_ERR; + } catch (const std::exception& e) { + std::cout << e.what() << std::endl; + } + std::cout << std::endl; + + // BEGIN EXAMPLES WITH NO EQUIVALENT + // these examples cannot be expressed using subarray + query condition + + // query condition does not have functions, here we use coalesce + std::cout << "WHERE coalesce(a, 2) + c < index" << std::endl; + read_array_with_predicates(ctx, {"coalesce(a, 2) + c < index"}); + std::cout << std::endl; + + // FIXME: this is query-condition-able, use arithmetic + std::cout << "WHERE a > 6 OR a IS NULL" << std::endl; + read_array_with_predicates(ctx, {"a > 6 OR a IS NULL"}); + std::cout << std::endl; + + return 0; +} From 21392974163f3a604469af9d690d76e758c23e6f Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Tue, 1 Jul 2025 14:10:34 -0400 Subject: [PATCH 11/52] clippy --- tiledb/oxidize/arrow/src/record_batch.rs | 2 +- tiledb/oxidize/arrow/src/schema.rs | 2 ++ tiledb/oxidize/session/src/lib.rs | 6 ++++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/tiledb/oxidize/arrow/src/record_batch.rs b/tiledb/oxidize/arrow/src/record_batch.rs index 095546cb72e..dd13b00c2a6 100644 --- a/tiledb/oxidize/arrow/src/record_batch.rs +++ b/tiledb/oxidize/arrow/src/record_batch.rs @@ -267,7 +267,7 @@ unsafe fn to_arrow_array( // it will require some refactoring so that we build the enumeration // ArrowArrays just once for the whole query, in addition to the // issues with regards to the enumeration being loaded - return 
Err(FieldError::EnumerationNotSupported); + Err(FieldError::EnumerationNotSupported) } _ => { // SAFETY: ensured by limited range of return values of `crate::schema::arrow_datatype` diff --git a/tiledb/oxidize/arrow/src/schema.rs b/tiledb/oxidize/arrow/src/schema.rs index 72106ea0d31..de8160d1814 100644 --- a/tiledb/oxidize/arrow/src/schema.rs +++ b/tiledb/oxidize/arrow/src/schema.rs @@ -52,6 +52,8 @@ pub mod cxx { /// Returns a [Schema] which represents the physical field types of /// the fields from `array_schema` which are contained in `select`. + // NB: we use `Vec` for facilitating the FFI boundary + #[allow(clippy::ptr_arg)] pub fn project_arrow( array_schema: &ArraySchema, select: &Vec, diff --git a/tiledb/oxidize/session/src/lib.rs b/tiledb/oxidize/session/src/lib.rs index 69dbf5c24f3..9333f5f2bc7 100644 --- a/tiledb/oxidize/session/src/lib.rs +++ b/tiledb/oxidize/session/src/lib.rs @@ -99,3 +99,9 @@ impl Session { .map_err(ParseExprError::TypeCoercion) } } + +impl Default for Session { + fn default() -> Self { + Self::new() + } +} From f10d1573d9151fc0f62e268e3b9c187769fcb5b6 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Tue, 1 Jul 2025 15:26:49 -0400 Subject: [PATCH 12/52] Add test on evolved schema --- test/src/unit-query-add-predicate.cc | 210 +++++++++++++++++++-------- test/support/src/array_templates.h | 3 + 2 files changed, 151 insertions(+), 62 deletions(-) diff --git a/test/src/unit-query-add-predicate.cc b/test/src/unit-query-add-predicate.cc index a75f80c291c..d9d105f728f 100644 --- a/test/src/unit-query-add-predicate.cc +++ b/test/src/unit-query-add-predicate.cc @@ -1,5 +1,5 @@ /** - * @file unit-capi-query-add-predicate.cc + * @file unit-query-add-predicate.cc * * @section LICENSE * @@ -83,39 +83,54 @@ struct QueryAddPredicateFx { bool allow_dups = false); /** - * Writes cells to saturate the ranges [[1, 4], [1, 4]] for an array + * Writes cells to a sparse array using the data in `input` + */ + template + void write_array(const 
std::string& path, const F& input = INPUT); + + /** + * Writes `INPUT` to saturate the ranges [[1, 4], [1, 4]] for an array * of the schema given above */ - void write_array( - const std::string& path, - tiledb_array_type_t atype, - const Cells input = INPUT); + void write_array_dense(const std::string& path); - Cells query_array( + template + F query_array( const std::string& path, tiledb_layout_t layout, std::vector predicates, const Config& query_config = Config()); - Cells query_array( + template + F query_array( const std::string& path, tiledb_layout_t layout, const char* predicate) { - return query_array(path, layout, std::vector{predicate}); + return query_array(path, layout, std::vector{predicate}); } }; +template +static F make_cells_generic( + std::vector d1, + std::vector d2, + std::vector... atts) { + return F{ + .d1_ = templates::query_buffers(d1), + .d2_ = templates::query_buffers(d2), + .atts_ = std::apply( + [](std::vector... att) { + return std::make_tuple...>( + templates::query_buffers(att)...); + }, + std::make_tuple(atts...))}; +} + static Cells make_cells( std::vector d1, std::vector d2, std::vector> a, std::vector v, std::vector> e) { - return Cells{ - .d1_ = templates::query_buffers(d1), - .d2_ = templates::query_buffers(d2), - .atts_ = std::make_tuple( - templates::query_buffers>(a), - templates::query_buffers(v), - templates::query_buffers>(e))}; + return make_cells_generic(d1, d2, a, v, e); } const Cells QueryAddPredicateFx::INPUT = make_cells( @@ -210,47 +225,52 @@ void QueryAddPredicateFx::create_array( Array::create(path, schema); } -void QueryAddPredicateFx::write_array( - const std::string& path, tiledb_array_type_t atype, const Cells input) { +template +void QueryAddPredicateFx::write_array(const std::string& path, const F& input) { auto ctx = context(); Array array(ctx, path, TILEDB_WRITE); Query query(ctx, array); - if (atype == TILEDB_DENSE) { - Subarray s(ctx, array); - s.add_range(0, 1, 4); - s.add_range(1, 1, 4); - 
query.set_layout(TILEDB_ROW_MAJOR).set_subarray(s); + auto field_sizes = + templates::query::make_field_sizes(const_cast(input)); + templates::query::set_fields( + ctx.ptr().get(), + query.ptr().get(), + field_sizes, + const_cast(input), + array.ptr().get()->array_schema_latest()); + query.submit(); +} - templates::Fragment< - std::optional, - std::vector, - std::optional> - cells = {.atts_ = input.atts_}; - - auto field_sizes = templates::query::make_field_sizes(cells); - templates::query::set_fields( - ctx.ptr().get(), - query.ptr().get(), - field_sizes, - cells, - array.ptr().get()->array_schema_latest()); - - query.submit(); - } else { - auto field_sizes = - templates::query::make_field_sizes(const_cast(input)); - templates::query::set_fields( - ctx.ptr().get(), - query.ptr().get(), - field_sizes, - const_cast(input), - array.ptr().get()->array_schema_latest()); - query.submit(); - } +void QueryAddPredicateFx::write_array_dense(const std::string& path) { + auto ctx = context(); + Array array(ctx, path, TILEDB_WRITE); + Query query(ctx, array); + + Subarray s(ctx, array); + s.add_range(0, 1, 4); + s.add_range(1, 1, 4); + query.set_layout(TILEDB_ROW_MAJOR).set_subarray(s); + + templates::Fragment< + std::optional, + std::vector, + std::optional> + cells = {.atts_ = INPUT.atts_}; + + auto field_sizes = templates::query::make_field_sizes(cells); + templates::query::set_fields( + ctx.ptr().get(), + query.ptr().get(), + field_sizes, + cells, + array.ptr().get()->array_schema_latest()); + + query.submit(); } -Cells QueryAddPredicateFx::query_array( +template +F QueryAddPredicateFx::query_array( const std::string& path, tiledb_layout_t layout, std::vector predicates, @@ -262,7 +282,7 @@ Cells QueryAddPredicateFx::query_array( query.set_config(config).set_layout(layout); - Cells out; + F out; out.resize(32); auto field_sizes = @@ -302,7 +322,7 @@ TEST_CASE_METHOD( vfs_test_setup_.array_uri("test_query_add_predicate_errors"); create_array(array_name, TILEDB_SPARSE); - 
write_array(array_name, TILEDB_SPARSE); + write_array(array_name); auto ctx = context(); @@ -389,12 +409,12 @@ TEST_CASE_METHOD( TEST_CASE_METHOD( QueryAddPredicateFx, "C API: Test query add predicate dense", - "[capi][query][add_predicate]") { + "[query][add_predicate]") { const std::string array_name = vfs_test_setup_.array_uri("test_query_add_predicate_dense"); create_array(array_name, TILEDB_DENSE); - write_array(array_name, TILEDB_DENSE); + write_array_dense(array_name); // FIXME: error messages REQUIRE_THROWS(query_array(array_name, TILEDB_UNORDERED, "row >= 3")); @@ -407,12 +427,12 @@ TEST_CASE_METHOD( TEST_CASE_METHOD( QueryAddPredicateFx, "C API: Test query add predicate sparse unsupported query order", - "[capi][query][add_predicate]") { + "[query][add_predicate]") { const std::string array_name = vfs_test_setup_.array_uri("test_query_add_predicate_sparse_unsupported"); create_array(array_name, TILEDB_SPARSE); - write_array(array_name, TILEDB_SPARSE); + write_array(array_name); const auto match = Catch::Matchers::ContainsSubstring( "This query does not support predicates added with " @@ -443,14 +463,14 @@ TEST_CASE_METHOD( TEST_CASE_METHOD( QueryAddPredicateFx, "C API: Test query add predicate sparse global order", - "[capi][query][add_predicate]") { + "[query][add_predicate]") { const std::string array_name = vfs_test_setup_.array_uri("test_query_add_predicate_sparse_global_order"); const auto query_order = GENERATE(TILEDB_GLOBAL_ORDER, TILEDB_UNORDERED); create_array(array_name, TILEDB_SPARSE); - write_array(array_name, TILEDB_SPARSE); + write_array(array_name); SECTION("WHERE TRUE") { const auto result = query_array(array_name, query_order, "TRUE"); @@ -569,7 +589,7 @@ TEST_CASE_METHOD( TEST_CASE_METHOD( QueryAddPredicateFx, "C API: Test query add predicate sparse unordered with dups", - "[capi][query][add_predicate]") { + "[query][add_predicate]") { const std::string array_name = vfs_test_setup_.array_uri( 
"test_query_add_predicate_sparse_unordered_with_dups"); @@ -591,9 +611,9 @@ TEST_CASE_METHOD( {7, 0, 6, std::nullopt, 1, 5, std::nullopt, 2}); // fragment 1: base input - write_array(array_name, TILEDB_SPARSE); - write_array(array_name, TILEDB_SPARSE, f2); - write_array(array_name, TILEDB_SPARSE, f3); + write_array(array_name); + write_array(array_name, f2); + write_array(array_name, f3); SECTION("WHERE TRUE") { const Cells expect = templates::query::concat({INPUT, f2, f3}); @@ -678,3 +698,69 @@ TEST_CASE_METHOD( "predicates")); } } + +/** + * Test that we do something reasonable when evaluating a predicate + * on an array whose schema evolved to have a different type for the + * same attribute + */ +TEST_CASE_METHOD( + QueryAddPredicateFx, + "C API: Test query add predicate on evolved schema with different type", + "[query][add_predicate]") { + const std::string array_name = + vfs_test_setup_.array_uri("test_query_add_predicate_evolution"); + + create_array(array_name, TILEDB_SPARSE); + write_array(array_name, INPUT); + + { + auto ctx = context(); + ArraySchemaEvolution(ctx).drop_attribute("a").array_evolve(array_name); + + ArraySchemaEvolution(ctx) + .add_attribute(Attribute::create(ctx, "a")) + .array_evolve(array_name); + } + + using CellsEvolved = templates::Fragment2D< + uint64_t, + uint64_t, + std::string, + std::optional, + std::string>; + + const CellsEvolved f2 = make_cells_generic< + CellsEvolved, + std::string, + std::optional, + std::string>( + {1, 2, 3, 4}, + {1, 2, 3, 4}, + {"seventeen", "eighteen", "nineteen", "twenty"}, + {0, 1, 2, 3}, + {"00", "01", "10", "11"}); + write_array(array_name, f2); + + SECTION("WHERE a LIKE '%1'") { + CellsEvolved expect = make_cells_generic< + CellsEvolved, + std::string, + std::optional, + std::string>( + {2, 4}, {2, 4}, {"eighteen", "twenty"}, {1, 3}, {"01", "11"}); + + const auto result = query_array( + array_name, TILEDB_GLOBAL_ORDER, "a LIKE '%1'"); + CHECK(result == expect); + } + + SECTION("WHERE a & 1 = 0") { 
+ REQUIRE_THROWS_WITH( + query_array(array_name, TILEDB_GLOBAL_ORDER, "a & 1 = 0"), + Catch::Matchers::ContainsSubstring( + "Error: Error adding predicate: Type coercion error: Error during " + "planning: Cannot infer common type for bitwise operation " + "LargeUtf8 & Int64")); + } +} diff --git a/test/support/src/array_templates.h b/test/support/src/array_templates.h index ad5dc56f660..5d1aed860bf 100644 --- a/test/support/src/array_templates.h +++ b/test/support/src/array_templates.h @@ -1147,6 +1147,9 @@ struct query_buffers>> { */ template <> struct query_buffers : public query_buffers> { + query_buffers() { + } + query_buffers(std::vector cells) { for (const auto& cell : cells) { offsets_.push_back(values_.size()); From 2b72ba4cc6dd0bcc818706015a8a1fe0d6e38ecb Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Tue, 1 Jul 2025 15:28:06 -0400 Subject: [PATCH 13/52] Change test names --- test/src/unit-query-add-predicate.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/test/src/unit-query-add-predicate.cc b/test/src/unit-query-add-predicate.cc index d9d105f728f..7a8f6b04ead 100644 --- a/test/src/unit-query-add-predicate.cc +++ b/test/src/unit-query-add-predicate.cc @@ -27,7 +27,7 @@ * * @section DESCRIPTION * - * Tests for the C API tiledb_query_add_predicate. + * Tests for the `tiledb_query_add_predicate` API. 
*/ #include @@ -316,7 +316,7 @@ F QueryAddPredicateFx::query_array( TEST_CASE_METHOD( QueryAddPredicateFx, - "C API: Test query add predicate errors", + "Query add predicate errors", "[capi][query][add_predicate]") { const std::string array_name = vfs_test_setup_.array_uri("test_query_add_predicate_errors"); @@ -408,7 +408,7 @@ TEST_CASE_METHOD( TEST_CASE_METHOD( QueryAddPredicateFx, - "C API: Test query add predicate dense", + "Query add predicate dense array", "[query][add_predicate]") { const std::string array_name = vfs_test_setup_.array_uri("test_query_add_predicate_dense"); @@ -426,7 +426,7 @@ TEST_CASE_METHOD( TEST_CASE_METHOD( QueryAddPredicateFx, - "C API: Test query add predicate sparse unsupported query order", + "Query add predicate sparse unsupported query order", "[query][add_predicate]") { const std::string array_name = vfs_test_setup_.array_uri("test_query_add_predicate_sparse_unsupported"); @@ -462,7 +462,7 @@ TEST_CASE_METHOD( TEST_CASE_METHOD( QueryAddPredicateFx, - "C API: Test query add predicate sparse global order", + "Query add predicate sparse global order", "[query][add_predicate]") { const std::string array_name = vfs_test_setup_.array_uri("test_query_add_predicate_sparse_global_order"); @@ -588,7 +588,7 @@ TEST_CASE_METHOD( TEST_CASE_METHOD( QueryAddPredicateFx, - "C API: Test query add predicate sparse unordered with dups", + "Query add predicate sparse unordered with dups", "[query][add_predicate]") { const std::string array_name = vfs_test_setup_.array_uri( "test_query_add_predicate_sparse_unordered_with_dups"); @@ -706,7 +706,7 @@ TEST_CASE_METHOD( */ TEST_CASE_METHOD( QueryAddPredicateFx, - "C API: Test query add predicate on evolved schema with different type", + "Query add predicate evolved schema", "[query][add_predicate]") { const std::string array_name = vfs_test_setup_.array_uri("test_query_add_predicate_evolution"); From 1d962ecd0dd903b0ec03da9098338d50f7125177 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Tue, 1 Jul 2025 
15:50:59 -0400 Subject: [PATCH 14/52] Fix non-rust build --- tiledb/sm/query/query.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tiledb/sm/query/query.cc b/tiledb/sm/query/query.cc index a41e5a70b40..e96768e7227 100644 --- a/tiledb/sm/query/query.cc +++ b/tiledb/sm/query/query.cc @@ -729,6 +729,7 @@ void Query::init() { fragment_name_)); } +#ifdef HAVE_RUST if (!predicates_.empty()) { try { // treat existing query condition (if any) as datafusion @@ -749,6 +750,7 @@ void Query::init() { "Error initializing predicates: " + std::string(e.what())); } } +#endif // Create the query strategy if querying main array and the Subarray does // not need to be updated. @@ -1520,7 +1522,7 @@ Status Query::set_condition(const QueryCondition& condition) { return Status::Ok(); } -Status Query::add_predicate(const char* predicate) { +Status Query::add_predicate([[maybe_unused]] const char* predicate) { if (type_ != QueryType::READ) { return logger_->status( Status_QueryError("Cannot add query predicate; Operation only " From b0d36d39b3b6a21e15c4cb46c64705aded03e1c5 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Tue, 1 Jul 2025 15:56:07 -0400 Subject: [PATCH 15/52] Fix osx build errors --- tiledb/sm/query/query.cc | 3 +-- tiledb/sm/query/query.h | 4 ++-- tiledb/sm/query/query_condition.cc | 2 +- tiledb/sm/query/query_condition.h | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/tiledb/sm/query/query.cc b/tiledb/sm/query/query.cc index e96768e7227..acb56b4fa05 100644 --- a/tiledb/sm/query/query.cc +++ b/tiledb/sm/query/query.cc @@ -1537,8 +1537,7 @@ Status Query::add_predicate([[maybe_unused]] const char* predicate) { #ifdef HAVE_RUST try { if (!session_.has_value()) { - session_.emplace( - std::move(tiledb::oxidize::datafusion::session::new_session())); + session_.emplace(tiledb::oxidize::datafusion::session::new_session()); } auto box_extern_expr = (*session_)->parse_expr(predicate, array_schema()); diff --git a/tiledb/sm/query/query.h 
b/tiledb/sm/query/query.h index 5834d716ba9..2382570127d 100644 --- a/tiledb/sm/query/query.h +++ b/tiledb/sm/query/query.h @@ -67,11 +67,11 @@ using namespace tiledb::common; namespace tiledb::oxidize::datafusion { namespace logical_expr { -class LogicalExpr; +struct LogicalExpr; } namespace session { -class Session; +struct Session; } } // namespace tiledb::oxidize::datafusion diff --git a/tiledb/sm/query/query_condition.cc b/tiledb/sm/query/query_condition.cc index ab346a77c67..e7dab0c6391 100644 --- a/tiledb/sm/query/query_condition.cc +++ b/tiledb/sm/query/query_condition.cc @@ -190,7 +190,7 @@ QueryCondition::as_datafusion(const ArraySchema& array_schema) { bool QueryCondition::rewrite_to_datafusion(const ArraySchema& array_schema) { if (!datafusion_.has_value()) { try { - datafusion_.emplace(array_schema, std::move(as_datafusion(array_schema))); + datafusion_.emplace(array_schema, as_datafusion(array_schema)); } catch (const ::rust::Error& e) { throw QueryConditionException( "Error compiling expression: " + std::string(e.what())); diff --git a/tiledb/sm/query/query_condition.h b/tiledb/sm/query/query_condition.h index 66c9b458d4d..ac71d438b65 100644 --- a/tiledb/sm/query/query_condition.h +++ b/tiledb/sm/query/query_condition.h @@ -51,7 +51,7 @@ namespace tiledb::oxidize::arrow::schema { struct ArrowSchema; } namespace tiledb::oxidize::datafusion::logical_expr { -class LogicalExpr; +struct LogicalExpr; } namespace tiledb::oxidize::datafusion::physical_expr { struct PhysicalExpr; From f246bd70bb147b2650094b60bdfa403c10daa32a Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Tue, 1 Jul 2025 16:14:32 -0400 Subject: [PATCH 16/52] Fix C API example print_elem buffer --- examples/c_api/query_add_predicate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/c_api/query_add_predicate.c b/examples/c_api/query_add_predicate.c index 2c64994aa9f..7e6dbe63eba 100644 --- a/examples/c_api/query_add_predicate.c +++ 
b/examples/c_api/query_add_predicate.c @@ -93,7 +93,7 @@ static const char* const states[] = { */ void print_elem( int* a, char* b_start, int b_len, int32_t c, float d, uint8_t* e) { - char print_a[8], print_e[16]; + char print_a[8], print_e[32]; if (a == NULL) { strcpy(&print_a[0], "null"); } else { From 744728d542ab00443ed3f2bff84b559f45efc796 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Tue, 1 Jul 2025 19:21:35 -0400 Subject: [PATCH 17/52] Comment new/updated test support functions --- test/support/src/array_templates.h | 18 ++++++++++++++++++ test/support/src/query_helpers.cc | 6 ++++++ 2 files changed, 24 insertions(+) diff --git a/test/support/src/array_templates.h b/test/support/src/array_templates.h index 5d1aed860bf..639b4268035 100644 --- a/test/support/src/array_templates.h +++ b/test/support/src/array_templates.h @@ -1158,6 +1158,10 @@ struct query_buffers : public query_buffers> { } }; +/** + * Dimension-less bundle of query buffers of the `Att...` types. + * Useful for dense array writes. + */ template struct Fragment { std::tuple...> atts_; @@ -1166,6 +1170,9 @@ struct Fragment { return std::get<0>(atts_).num_cells(); } + /** + * @return a tuple containing references to the dimension fields + */ std::tuple<> dimensions() const { return std::tuple<>(); } @@ -1455,6 +1462,10 @@ void set_fields( }(fragment.attributes()); } +/** + * Adds the buffers from `fragment` to a query, + * using `schema` to look up field names for the positional fields of `F`. + */ template void set_fields( tiledb_ctx_t* ctx, @@ -1484,6 +1495,10 @@ uint64_t num_cells(const F& fragment, const auto& field_sizes) { }(std::tuple_cat(fragment.dimensions(), fragment.attributes())); } +/** + * Resizes the fields of `fragment` using the query output field sizes + * `field_sizes`. 
+ */ template void resize_fields(F& fragment, const auto& field_sizes) { std::apply( @@ -1497,6 +1512,9 @@ void resize_fields(F& fragment, const auto& field_sizes) { std::tuple_cat(fragment.dimensions(), fragment.attributes())); } +/** + * @return the concatenation of one or more fragments + */ template F concat(std::initializer_list fragments) { F concat; diff --git a/test/support/src/query_helpers.cc b/test/support/src/query_helpers.cc index de1bd690866..99170bc5216 100644 --- a/test/support/src/query_helpers.cc +++ b/test/support/src/query_helpers.cc @@ -37,6 +37,9 @@ namespace tiledb::test { using namespace tiledb::sm; +/** + * @return a SQL representation of a `QueryConditionOp` + */ static const char* to_sql_op(QueryConditionOp op) { switch (op) { case QueryConditionOp::LT: @@ -58,6 +61,9 @@ static const char* to_sql_op(QueryConditionOp op) { } } +/** + * @return a SQL representation of the query condition syntax tree + */ std::string to_sql(const ASTNode& ast, const ArraySchema& schema) { const ASTNodeVal* valnode = static_cast(&ast); const ASTNodeExpr* exprnode = dynamic_cast(&ast); From bead919c8af287b95757a318f0a79b8ed9407407 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Tue, 1 Jul 2025 19:22:23 -0400 Subject: [PATCH 18/52] Remove unnecessary ExternType impl --- tiledb/oxidize/session/src/lib.rs | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tiledb/oxidize/session/src/lib.rs b/tiledb/oxidize/session/src/lib.rs index 9333f5f2bc7..84041b19660 100644 --- a/tiledb/oxidize/session/src/lib.rs +++ b/tiledb/oxidize/session/src/lib.rs @@ -30,11 +30,6 @@ mod ffi { #[repr(transparent)] struct ExternLogicalExpr(pub LogicalExpr); -unsafe impl cxx::ExternType for ExternLogicalExpr { - type Id = cxx::type_id!("tiledb::oxidize::datafusion::logical_expr::LogicalExpr"); - type Kind = cxx::kind::Opaque; -} - fn new_session() -> Box { Box::new(Session::new()) } From 02fb0ca9c9f5131b07ce16fb317d74cdf6d0656e Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Tue, 1 Jul 
2025 19:33:12 -0400 Subject: [PATCH 19/52] Self-review code comments --- tiledb/oxidize/arrow/src/record_batch.rs | 3 ++- tiledb/oxidize/expr/src/logical_expr.rs | 1 + tiledb/oxidize/session/src/lib.rs | 12 ++++++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/tiledb/oxidize/arrow/src/record_batch.rs b/tiledb/oxidize/arrow/src/record_batch.rs index dd13b00c2a6..34e0a6688a0 100644 --- a/tiledb/oxidize/arrow/src/record_batch.rs +++ b/tiledb/oxidize/arrow/src/record_batch.rs @@ -236,7 +236,8 @@ unsafe fn to_arrow_array( }; let offsets = crate::offsets::try_from_bytes(1, fixed.as_slice())?; let values = unsafe { - // SAFETY: TODO add comment + // SAFETY: the caller is responsible that `fixed` out-lives + // the `Buffer` created here. See function docs. to_buffer::(var_tile) }?; diff --git a/tiledb/oxidize/expr/src/logical_expr.rs b/tiledb/oxidize/expr/src/logical_expr.rs index d805d5d649a..9a04f2928d5 100644 --- a/tiledb/oxidize/expr/src/logical_expr.rs +++ b/tiledb/oxidize/expr/src/logical_expr.rs @@ -67,6 +67,7 @@ impl Display for LogicalExpr { } } +/// Returns a conjunction of the logical exprs `e1 AND e2 AND ... AND eN`. pub fn make_conjunction(exprs: &[Box]) -> Box { Box::new(LogicalExpr( datafusion::logical_expr::utils::conjunction(exprs.iter().map(|e| e.0.clone())) diff --git a/tiledb/oxidize/session/src/lib.rs b/tiledb/oxidize/session/src/lib.rs index 84041b19660..e4b7f72a28c 100644 --- a/tiledb/oxidize/session/src/lib.rs +++ b/tiledb/oxidize/session/src/lib.rs @@ -27,6 +27,18 @@ mod ffi { } } +/// Wrapper for `tiledb_expr::logical_expr::LogicalExpr`. +// This ideally would not be necessary but a weakness of cxx is that it does +// not recognize that the same Rust type in different crates (or even modules) +// can map to the same C++ type. 
+// +// See https://github.com/dtolnay/cxx/issues/1323 +// +// We can fortunately work around this as follows: +// 1) `#[repr(transparent)]` ensures that the wrapper type has the same +// underlying representation as the wrapped type +// 2) `rust::Box::into_raw` and `rust::Box::from_raw` on the C++ side which +// allow us to cast the wrapper type into the wrapped type. #[repr(transparent)] struct ExternLogicalExpr(pub LogicalExpr); From ec829e045f886ce0ee9e4281e11532e23fac20e7 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Tue, 1 Jul 2025 19:37:59 -0400 Subject: [PATCH 20/52] Attempt to fix query_add_predicate error --- examples/cpp_api/query_add_predicate.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/cpp_api/query_add_predicate.cc b/examples/cpp_api/query_add_predicate.cc index 5314c7564d8..cdafad42f2e 100644 --- a/examples/cpp_api/query_add_predicate.cc +++ b/examples/cpp_api/query_add_predicate.cc @@ -255,7 +255,11 @@ void read_array_with_predicates( auto table = query.result_buffer_elements_nullable(); size_t num_elements_result = std::get<1>(table["c"]); uint64_t b_str_length = std::get<1>(table["b"]); - b_data_offsets.push_back(b_str_length); + if (num_elements_result < b_data_offsets.size()) { + b_data_offsets[num_elements_result] = b_str_length; + } else { + b_data_offsets.push_back(b_str_length); + } // Here we print all the elements that are returned by the query. 
for (size_t i = 0; i < num_elements_result; ++i) { From f2ddb4be14d9a885c6aeade96199753ab7e051b3 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Tue, 1 Jul 2025 21:39:52 -0400 Subject: [PATCH 21/52] Undo clang-format-17 string splits --- tiledb/sm/query/query.cc | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tiledb/sm/query/query.cc b/tiledb/sm/query/query.cc index acb56b4fa05..e700ca21757 100644 --- a/tiledb/sm/query/query.cc +++ b/tiledb/sm/query/query.cc @@ -1962,11 +1962,9 @@ Status Query::create_strategy(bool skip_checks_serialization) { if (array_schema_->dense() && !all_dense) { LOG_WARN( "This dense array contains sparse fragments. Support for reading " - "sparse fragments in dense arrays will be removed in TileDB " - "version " + "sparse fragments in dense arrays will be removed in TileDB version " "2.27 to be released in September 2024. To make sure this array " - "continues to work after an upgrade to version 2.27 or later, " - "please " + "continues to work after an upgrade to version 2.27 or later, please " "consolidate the sparse fragments using a TileDB version 2.26 or " "earlier."); } @@ -2091,8 +2089,8 @@ Status Query::check_buffer_names() { "cells to be written")); } - // All attributes/dimensions must be provided unless this query is only - // for dimension labels. + // All attributes/dimensions must be provided unless this query is only for + // dimension labels. 
if (!only_dim_label_query() && !allow_separate_attribute_writes()) { auto expected_num = array_schema_->attribute_num(); expected_num += static_cast( From 176a0218b9a406b62864338aae2ff2b555c14db6 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Tue, 1 Jul 2025 22:07:46 -0400 Subject: [PATCH 22/52] Change C++ API to use std::string --- examples/cpp_api/query_add_predicate.cc | 2 +- test/src/unit-query-add-predicate.cc | 77 +++++++++++++------------ tiledb/sm/cpp_api/query_experimental.h | 4 +- 3 files changed, 44 insertions(+), 39 deletions(-) diff --git a/examples/cpp_api/query_add_predicate.cc b/examples/cpp_api/query_add_predicate.cc index cdafad42f2e..a640974044d 100644 --- a/examples/cpp_api/query_add_predicate.cc +++ b/examples/cpp_api/query_add_predicate.cc @@ -242,7 +242,7 @@ void read_array_with_predicates( .set_validity_buffer("e", e_validity); for (const auto& predicate : predicates) { - QueryExperimental::add_predicate(ctx, query, predicate.c_str()); + QueryExperimental::add_predicate(ctx, query, predicate); } query.submit(); diff --git a/test/src/unit-query-add-predicate.cc b/test/src/unit-query-add-predicate.cc index 7a8f6b04ead..8ec548baebb 100644 --- a/test/src/unit-query-add-predicate.cc +++ b/test/src/unit-query-add-predicate.cc @@ -98,14 +98,8 @@ struct QueryAddPredicateFx { F query_array( const std::string& path, tiledb_layout_t layout, - std::vector predicates, + const std::vector& predicates, const Config& query_config = Config()); - - template - F query_array( - const std::string& path, tiledb_layout_t layout, const char* predicate) { - return query_array(path, layout, std::vector{predicate}); - } }; template @@ -273,7 +267,7 @@ template F QueryAddPredicateFx::query_array( const std::string& path, tiledb_layout_t layout, - std::vector predicates, + const std::vector& predicates, const Config& config) { auto ctx = context(); @@ -295,7 +289,7 @@ F QueryAddPredicateFx::query_array( out, array.ptr().get()->array_schema_latest()); - for (const 
char* pred : predicates) { + for (const std::string& pred : predicates) { QueryExperimental::add_predicate(ctx, query, pred); } @@ -331,7 +325,7 @@ TEST_CASE_METHOD( Query query(ctx, array); REQUIRE_THROWS_WITH( - QueryExperimental::add_predicate(ctx, query, "row BETWEEN 4 AND 7"), + QueryExperimental::add_predicate(ctx, query, {"row BETWEEN 4 AND 7"}), Catch::Matchers::ContainsSubstring( "Cannot add query predicate; Operation only applicable to read " "queries")); @@ -342,8 +336,13 @@ TEST_CASE_METHOD( Query query(ctx, array); SECTION("Null") { - REQUIRE_THROWS_WITH( - QueryExperimental::add_predicate(ctx, query, nullptr), + const auto maybe_err = error_if_any( + ctx.ptr().get(), + tiledb_query_add_predicate( + ctx.ptr().get(), query.ptr().get(), nullptr)); + REQUIRE(maybe_err.has_value()); + REQUIRE_THAT( + maybe_err.value(), Catch::Matchers::ContainsSubstring( "Argument \"predicate\" may not be NULL")); } @@ -353,7 +352,7 @@ TEST_CASE_METHOD( // If you dbg! the returned expr it prints `Expr::Column(Column { name: // "row" })` REQUIRE_THROWS_WITH( - QueryExperimental::add_predicate(ctx, query, "row col"), + QueryExperimental::add_predicate(ctx, query, {"row col"}), Catch::Matchers::ContainsSubstring( "Error: Expression does not return a boolean value")); } @@ -361,7 +360,7 @@ TEST_CASE_METHOD( SECTION("Non-expression") { REQUIRE_THROWS_WITH( QueryExperimental::add_predicate( - ctx, query, "CREATE TABLE foo (id INT)"), + ctx, query, {"CREATE TABLE foo (id INT)"}), Catch::Matchers::ContainsSubstring( "Error adding predicate: Parse error: SQL error: " "ParserError(\"Unsupported command in expression\")")); @@ -369,14 +368,14 @@ TEST_CASE_METHOD( SECTION("Not a predicate") { REQUIRE_THROWS_WITH( - QueryExperimental::add_predicate(ctx, query, "row"), + QueryExperimental::add_predicate(ctx, query, {"row"}), Catch::Matchers::ContainsSubstring( "Expression does not return a boolean value")); } SECTION("Schema error") { REQUIRE_THROWS_WITH( - 
QueryExperimental::add_predicate(ctx, query, "depth = 3"), + QueryExperimental::add_predicate(ctx, query, {"depth = 3"}), Catch::Matchers::ContainsSubstring( "Error adding predicate: Parse error: Schema error: No field " "named depth. Valid fields are row, col, a, v, e.")); @@ -393,13 +392,14 @@ TEST_CASE_METHOD( "caused by a bug in DataFusion's code and we would welcome that you " "file an bug report in our issue tracker"; REQUIRE_THROWS_WITH( - QueryExperimental::add_predicate(ctx, query, "starts_with(row, '1')"), + QueryExperimental::add_predicate( + ctx, query, {"starts_with(row, '1')"}), Catch::Matchers::ContainsSubstring(dferror)); } SECTION("Aggregate") { REQUIRE_THROWS_WITH( - QueryExperimental::add_predicate(ctx, query, "sum(row) >= 10"), + QueryExperimental::add_predicate(ctx, query, {"sum(row) >= 10"}), Catch::Matchers::ContainsSubstring( "Aggregate functions in predicate is not supported")); } @@ -417,11 +417,11 @@ TEST_CASE_METHOD( write_array_dense(array_name); // FIXME: error messages - REQUIRE_THROWS(query_array(array_name, TILEDB_UNORDERED, "row >= 3")); - REQUIRE_THROWS(query_array(array_name, TILEDB_ROW_MAJOR, "row >= 3")); - REQUIRE_THROWS(query_array(array_name, TILEDB_COL_MAJOR, "row >= 3")); - REQUIRE_THROWS(query_array(array_name, TILEDB_GLOBAL_ORDER, "row >= 3")); - REQUIRE_THROWS(query_array(array_name, TILEDB_HILBERT, "row >= 3")); + REQUIRE_THROWS(query_array(array_name, TILEDB_UNORDERED, {"row >= 3"})); + REQUIRE_THROWS(query_array(array_name, TILEDB_ROW_MAJOR, {"row >= 3"})); + REQUIRE_THROWS(query_array(array_name, TILEDB_COL_MAJOR, {"row >= 3"})); + REQUIRE_THROWS(query_array(array_name, TILEDB_GLOBAL_ORDER, {"row >= 3"})); + REQUIRE_THROWS(query_array(array_name, TILEDB_HILBERT, {"row >= 3"})); } TEST_CASE_METHOD( @@ -473,7 +473,7 @@ TEST_CASE_METHOD( write_array(array_name); SECTION("WHERE TRUE") { - const auto result = query_array(array_name, query_order, "TRUE"); + const auto result = query_array(array_name, query_order, 
{"TRUE"}); CHECK(result == INPUT); } @@ -505,7 +505,7 @@ TEST_CASE_METHOD( 7, std::nullopt}); - const auto result = query_array(array_name, query_order, "a IS NOT NULL"); + const auto result = query_array(array_name, query_order, {"a IS NOT NULL"}); CHECK(result == expect); } @@ -517,7 +517,8 @@ TEST_CASE_METHOD( {"four", "five", "eight", "eleven", "fifteen"}, {std::nullopt, 7, 0, 3, 7}); - const auto result = query_array(array_name, query_order, "v < 'fourteen'"); + const auto result = + query_array(array_name, query_order, {"v < 'fourteen'"}); CHECK(result == expect); } @@ -529,7 +530,8 @@ TEST_CASE_METHOD( {"one", "two", "three", "five", "six", "nine"}, {4, 4, 7, 7, 7, 1}); - const auto result = query_array(array_name, query_order, "row + col <= 4"); + const auto result = + query_array(array_name, query_order, {"row + col <= 4"}); CHECK(result == expect); } @@ -571,14 +573,14 @@ TEST_CASE_METHOD( std::nullopt}); const auto result = - query_array(array_name, query_order, "coalesce(a, row) > col"); + query_array(array_name, query_order, {"coalesce(a, row) > col"}); CHECK(result == expect); } SECTION("WHERE e < 'california'") { // enumeration not supported yet REQUIRE_THROWS_WITH( - query_array(array_name, query_order, "e < 'california'"), + query_array(array_name, query_order, {"e < 'california'"}), Catch::Matchers::ContainsSubstring( "QueryCondition: Error evaluating expression: Cannot process field " "'e': Attributes with enumerations are not supported in text " @@ -617,7 +619,7 @@ TEST_CASE_METHOD( SECTION("WHERE TRUE") { const Cells expect = templates::query::concat({INPUT, f2, f3}); - const auto result = query_array(array_name, query_order, "TRUE"); + const auto result = query_array(array_name, query_order, {"TRUE"}); CHECK(result == expect); } @@ -629,7 +631,8 @@ TEST_CASE_METHOD( {"four", "five", "eight", "eleven", "fifteen", "dos", "cinco"}, {std::nullopt, 7, 0, 3, 7, 0, 1}); - const auto result = query_array(array_name, query_order, "v < 'fourteen'"); + 
const auto result = + query_array(array_name, query_order, {"v < 'fourteen'"}); CHECK(result == expect); } @@ -663,7 +666,8 @@ TEST_CASE_METHOD( "cinco"}, {4, 4, 7, 7, 7, 1, 0, 1, 2, 7, 0, 1}); - const auto result = query_array(array_name, query_order, "row + col <= 4"); + const auto result = + query_array(array_name, query_order, {"row + col <= 4"}); CHECK(result == expect); } @@ -684,14 +688,14 @@ TEST_CASE_METHOD( const Cells expect = f2; const auto result = query_array( - array_name, query_order, "octet_length(v) > char_length(v)"); + array_name, query_order, {"octet_length(v) > char_length(v)"}); CHECK(result == expect); } SECTION("WHERE e < 'california'") { // enumeration not supported yet REQUIRE_THROWS_WITH( - query_array(array_name, query_order, "e < 'california'"), + query_array(array_name, query_order, {"e < 'california'"}), Catch::Matchers::ContainsSubstring( "QueryCondition: Error evaluating expression: Cannot process field " "'e': Attributes with enumerations are not supported in text " @@ -751,13 +755,14 @@ TEST_CASE_METHOD( {2, 4}, {2, 4}, {"eighteen", "twenty"}, {1, 3}, {"01", "11"}); const auto result = query_array( - array_name, TILEDB_GLOBAL_ORDER, "a LIKE '%1'"); + array_name, TILEDB_GLOBAL_ORDER, {"a LIKE '%1'"}); CHECK(result == expect); } SECTION("WHERE a & 1 = 0") { REQUIRE_THROWS_WITH( - query_array(array_name, TILEDB_GLOBAL_ORDER, "a & 1 = 0"), + query_array( + array_name, TILEDB_GLOBAL_ORDER, {"a & 1 = 0"}), Catch::Matchers::ContainsSubstring( "Error: Error adding predicate: Type coercion error: Error during " "planning: Cannot infer common type for bitwise operation " diff --git a/tiledb/sm/cpp_api/query_experimental.h b/tiledb/sm/cpp_api/query_experimental.h index 94cbd9abe91..245c6e02ee3 100644 --- a/tiledb/sm/cpp_api/query_experimental.h +++ b/tiledb/sm/cpp_api/query_experimental.h @@ -81,9 +81,9 @@ class QueryExperimental { * @param predicate A text representation of the desired predicate. 
*/ static void add_predicate( - const Context& ctx, Query& query, const char* predicate) { + const Context& ctx, Query& query, const std::string& predicate) { ctx.handle_error(tiledb_query_add_predicate( - ctx.ptr().get(), query.ptr().get(), predicate)); + ctx.ptr().get(), query.ptr().get(), predicate.c_str())); } /** From d1c7680c674316d05927199447647e8ebc82ce20 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Tue, 1 Jul 2025 22:10:32 -0400 Subject: [PATCH 23/52] Remove logger_->status --- tiledb/sm/query/query.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tiledb/sm/query/query.cc b/tiledb/sm/query/query.cc index e700ca21757..3a8a1ce3caf 100644 --- a/tiledb/sm/query/query.cc +++ b/tiledb/sm/query/query.cc @@ -1524,14 +1524,14 @@ Status Query::set_condition(const QueryCondition& condition) { Status Query::add_predicate([[maybe_unused]] const char* predicate) { if (type_ != QueryType::READ) { - return logger_->status( - Status_QueryError("Cannot add query predicate; Operation only " - "applicable to read queries")); + return Status_QueryError( + "Cannot add query predicate; Operation only " + "applicable to read queries"); } if (status_ != tiledb::sm::QueryStatus::UNINITIALIZED) { - return logger_->status(Status_QueryError( + return Status_QueryError( "Cannot add query predicate; Adding a predicate to an already " - "initialized query is not supported.")); + "initialized query is not supported."); } #ifdef HAVE_RUST @@ -1568,9 +1568,9 @@ Status Query::add_predicate([[maybe_unused]] const char* predicate) { return Status::Ok(); #else - return logger_->status( - Status_QueryError("Cannot add query predicate: feature requires build " - "configuration '-DTILEDB_RUST=ON'")); + return Status_QueryError( + "Cannot add query predicate: feature requires build " + "configuration '-DTILEDB_RUST=ON'"); #endif } From 653c89076660c710fbbd6db662e4d5bc65295429 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Tue, 1 Jul 2025 22:12:49 -0400 
Subject: [PATCH 24/52] SQL dialect in API comments --- tiledb/sm/c_api/tiledb_experimental.h | 4 ++-- tiledb/sm/cpp_api/query_experimental.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tiledb/sm/c_api/tiledb_experimental.h b/tiledb/sm/c_api/tiledb_experimental.h index 4cf5468f25a..e3876670262 100644 --- a/tiledb/sm/c_api/tiledb_experimental.h +++ b/tiledb/sm/c_api/tiledb_experimental.h @@ -463,8 +463,8 @@ TILEDB_EXPORT int32_t tiledb_query_condition_set_use_enumeration( * will be analyzed and evaluated in the subarray step, query condition * step, or both. * - * The predicate is parsed as a SQL expression and must evaluate - * to a boolean. + * The predicate is parsed as an Apache DataFusion SQL expression and must + * evaluate to a boolean. * * **Example:** * diff --git a/tiledb/sm/cpp_api/query_experimental.h b/tiledb/sm/cpp_api/query_experimental.h index 245c6e02ee3..9ed9ba8f0cd 100644 --- a/tiledb/sm/cpp_api/query_experimental.h +++ b/tiledb/sm/cpp_api/query_experimental.h @@ -73,8 +73,8 @@ class QueryExperimental { * will be analyzed and evaluated in the subarray step, query condition * step, or both. * - * The predicate is parsed as a SQL expression and must evaluate - * to a boolean. + * The predicate is parsed as an Apache DataFusion SQL expression and must + * evaluate to a boolean. * * @param ctx The TileDB context. * @param query The TileDB query. 
From a00301c40b8a6d480fe3ab453a6f227b0057beaf Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Wed, 2 Jul 2025 09:14:11 -0400 Subject: [PATCH 25/52] Query add predicate to in progress query --- test/src/unit-query-add-predicate.cc | 40 ++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/test/src/unit-query-add-predicate.cc b/test/src/unit-query-add-predicate.cc index 8ec548baebb..6c9e93f83bc 100644 --- a/test/src/unit-query-add-predicate.cc +++ b/test/src/unit-query-add-predicate.cc @@ -406,6 +406,46 @@ TEST_CASE_METHOD( } } +TEST_CASE_METHOD( + QueryAddPredicateFx, + "Query add predicate to in progress query", + "[query][add_predicate]") { + const std::string array_name = + vfs_test_setup_.array_uri("test_query_add_predicate_in_progress"); + + create_array(array_name, TILEDB_SPARSE); + write_array(array_name); + + auto ctx = context(); + + Array array(ctx, array_name, TILEDB_READ); + Query query(ctx, array); + + query.set_layout(TILEDB_GLOBAL_ORDER); + + Cells out; + out.resize(INPUT.size() - 1); + + auto field_sizes = + templates::query::make_field_sizes(out, out.size()); + + templates::query::set_fields( + ctx.ptr().get(), + query.ptr().get(), + field_sizes, + out, + array.ptr().get()->array_schema_latest()); + + const auto st = query.submit(); + REQUIRE(st == Query::Status::INCOMPLETE); + + const auto expect_err = Catch::Matchers::ContainsSubstring( + "Cannot add query predicate; Adding a predicate to an already " + "initialized query is not supported."); + REQUIRE_THROWS_WITH( + QueryExperimental::add_predicate(ctx, query, "row = col"), expect_err); +} + TEST_CASE_METHOD( QueryAddPredicateFx, "Query add predicate dense array", From be32216b3f10758506bbf5586cc1d51da34fd250 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Wed, 2 Jul 2025 09:14:29 -0400 Subject: [PATCH 26/52] Fix bizarre -Warray-bounds error for b_data_offsets --- examples/cpp_api/query_add_predicate.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git 
a/examples/cpp_api/query_add_predicate.cc b/examples/cpp_api/query_add_predicate.cc index a640974044d..50eae669b7a 100644 --- a/examples/cpp_api/query_add_predicate.cc +++ b/examples/cpp_api/query_add_predicate.cc @@ -228,6 +228,12 @@ void read_array_with_predicates( std::vector e_keys(reserve_cells); std::vector e_validity(reserve_cells); + // reserve additional space so we can push a trailing offset + // to make the printing logic more straightforward + // (this should not be necessary but without this the `push_back` + // flags -Werror=array-bounds in some compilers) + b_data_offsets.reserve(reserve_cells + 1); + // Execute the read query. Array array(ctx, array_name, TILEDB_READ); Query query(ctx, array); From 763a3e2cd8cabb5de191accc9837f096bec369bb Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Wed, 2 Jul 2025 10:11:54 -0400 Subject: [PATCH 27/52] Query add predicate with query condition --- test/src/unit-query-add-predicate.cc | 246 ++++++++++++++++++++------- 1 file changed, 183 insertions(+), 63 deletions(-) diff --git a/test/src/unit-query-add-predicate.cc b/test/src/unit-query-add-predicate.cc index 6c9e93f83bc..ea9429af641 100644 --- a/test/src/unit-query-add-predicate.cc +++ b/test/src/unit-query-add-predicate.cc @@ -62,13 +62,19 @@ using Cells = templates::Fragment2D< std::vector, std::optional>; +struct QueryArrayKWArgs { + Config config; + std::optional condition; +}; + struct QueryAddPredicateFx { VFSTestSetup vfs_test_setup_; + Context ctx_; static const Cells INPUT; - Context context() const { - return vfs_test_setup_.ctx(); + QueryAddPredicateFx() + : ctx_(vfs_test_setup_.ctx()) { } /** @@ -99,7 +105,7 @@ struct QueryAddPredicateFx { const std::string& path, tiledb_layout_t layout, const std::vector& predicates, - const Config& query_config = Config()); + const QueryArrayKWArgs& kwargs = QueryArrayKWArgs()); }; template @@ -179,22 +185,26 @@ const Cells QueryAddPredicateFx::INPUT = make_cells( 7, std::nullopt}); +const auto 
matchEnumerationNotSupported = Catch::Matchers::ContainsSubstring( + "QueryCondition: Error evaluating expression: Cannot process field " + "'e': Attributes with enumerations are not supported in text " + "predicates"); + void QueryAddPredicateFx::create_array( const std::string& path, tiledb_array_type_t atype, bool allow_dups) { - auto ctx = context(); + Domain domain(ctx_); + domain.add_dimension(Dimension::create(ctx_, "row", {{1, 4}}, 4)); + domain.add_dimension(Dimension::create(ctx_, "col", {{1, 4}}, 4)); - Domain domain(ctx); - domain.add_dimension(Dimension::create(ctx, "row", {{1, 4}}, 4)); - domain.add_dimension(Dimension::create(ctx, "col", {{1, 4}}, 4)); - - ArraySchema schema(ctx, atype); + ArraySchema schema(ctx_, atype); schema.set_tile_order(TILEDB_ROW_MAJOR); schema.set_cell_order(TILEDB_ROW_MAJOR); schema.set_domain(domain); schema.set_allows_dups(allow_dups); - schema.add_attribute(Attribute::create(ctx, "a").set_nullable(true)); - schema.add_attribute(Attribute::create(ctx, "v")); + schema.add_attribute( + Attribute::create(ctx_, "a").set_nullable(true)); + schema.add_attribute(Attribute::create(ctx_, "v")); // enumerated attribute std::vector us_states = { @@ -207,12 +217,12 @@ void QueryAddPredicateFx::create_array( "connecticut", "etc"}; ArraySchemaExperimental::add_enumeration( - ctx, + ctx_, schema, - Enumeration::create(ctx, std::string("us_states"), us_states)); + Enumeration::create(ctx_, std::string("us_states"), us_states)); { - auto e = Attribute::create(ctx, "e").set_nullable(true); - AttributeExperimental::set_enumeration_name(ctx, e, "us_states"); + auto e = Attribute::create(ctx_, "e").set_nullable(true); + AttributeExperimental::set_enumeration_name(ctx_, e, "us_states"); schema.add_attribute(e); } @@ -221,14 +231,13 @@ void QueryAddPredicateFx::create_array( template void QueryAddPredicateFx::write_array(const std::string& path, const F& input) { - auto ctx = context(); - Array array(ctx, path, TILEDB_WRITE); - Query query(ctx, 
array); + Array array(ctx_, path, TILEDB_WRITE); + Query query(ctx_, array); auto field_sizes = templates::query::make_field_sizes(const_cast(input)); templates::query::set_fields( - ctx.ptr().get(), + ctx_.ptr().get(), query.ptr().get(), field_sizes, const_cast(input), @@ -237,11 +246,10 @@ void QueryAddPredicateFx::write_array(const std::string& path, const F& input) { } void QueryAddPredicateFx::write_array_dense(const std::string& path) { - auto ctx = context(); - Array array(ctx, path, TILEDB_WRITE); - Query query(ctx, array); + Array array(ctx_, path, TILEDB_WRITE); + Query query(ctx_, array); - Subarray s(ctx, array); + Subarray s(ctx_, array); s.add_range(0, 1, 4); s.add_range(1, 1, 4); query.set_layout(TILEDB_ROW_MAJOR).set_subarray(s); @@ -254,7 +262,7 @@ void QueryAddPredicateFx::write_array_dense(const std::string& path) { auto field_sizes = templates::query::make_field_sizes(cells); templates::query::set_fields( - ctx.ptr().get(), + ctx_.ptr().get(), query.ptr().get(), field_sizes, cells, @@ -268,13 +276,11 @@ F QueryAddPredicateFx::query_array( const std::string& path, tiledb_layout_t layout, const std::vector& predicates, - const Config& config) { - auto ctx = context(); - - Array array(ctx, path, TILEDB_READ); - Query query(ctx, array); + const QueryArrayKWArgs& kwargs) { + Array array(ctx_, path, TILEDB_READ); + Query query(ctx_, array); - query.set_config(config).set_layout(layout); + query.set_config(kwargs.config).set_layout(layout); F out; out.resize(32); @@ -283,18 +289,22 @@ F QueryAddPredicateFx::query_array( templates::query::make_field_sizes(out, out.size()); templates::query::set_fields( - ctx.ptr().get(), + ctx_.ptr().get(), query.ptr().get(), field_sizes, out, array.ptr().get()->array_schema_latest()); for (const std::string& pred : predicates) { - QueryExperimental::add_predicate(ctx, query, pred); + QueryExperimental::add_predicate(ctx_, query, pred); + } + + if (kwargs.condition.has_value()) { + 
query.set_condition(kwargs.condition.value()); } if (array.schema().array_type() == TILEDB_DENSE) { - Subarray s(ctx, array); + Subarray s(ctx_, array); s.add_range(0, 1, 4); s.add_range(1, 1, 4); query.set_subarray(s); @@ -318,28 +328,26 @@ TEST_CASE_METHOD( create_array(array_name, TILEDB_SPARSE); write_array(array_name); - auto ctx = context(); - SECTION("Non-read query errors") { - Array array(ctx, array_name, TILEDB_WRITE); - Query query(ctx, array); + Array array(ctx_, array_name, TILEDB_WRITE); + Query query(ctx_, array); REQUIRE_THROWS_WITH( - QueryExperimental::add_predicate(ctx, query, {"row BETWEEN 4 AND 7"}), + QueryExperimental::add_predicate(ctx_, query, {"row BETWEEN 4 AND 7"}), Catch::Matchers::ContainsSubstring( "Cannot add query predicate; Operation only applicable to read " "queries")); } SECTION("Read query errors") { - Array array(ctx, array_name, TILEDB_READ); - Query query(ctx, array); + Array array(ctx_, array_name, TILEDB_READ); + Query query(ctx_, array); SECTION("Null") { const auto maybe_err = error_if_any( - ctx.ptr().get(), + ctx_.ptr().get(), tiledb_query_add_predicate( - ctx.ptr().get(), query.ptr().get(), nullptr)); + ctx_.ptr().get(), query.ptr().get(), nullptr)); REQUIRE(maybe_err.has_value()); REQUIRE_THAT( maybe_err.value(), @@ -352,7 +360,7 @@ TEST_CASE_METHOD( // If you dbg! 
the returned expr it prints `Expr::Column(Column { name: // "row" })` REQUIRE_THROWS_WITH( - QueryExperimental::add_predicate(ctx, query, {"row col"}), + QueryExperimental::add_predicate(ctx_, query, {"row col"}), Catch::Matchers::ContainsSubstring( "Error: Expression does not return a boolean value")); } @@ -360,7 +368,7 @@ TEST_CASE_METHOD( SECTION("Non-expression") { REQUIRE_THROWS_WITH( QueryExperimental::add_predicate( - ctx, query, {"CREATE TABLE foo (id INT)"}), + ctx_, query, {"CREATE TABLE foo (id INT)"}), Catch::Matchers::ContainsSubstring( "Error adding predicate: Parse error: SQL error: " "ParserError(\"Unsupported command in expression\")")); @@ -368,14 +376,14 @@ TEST_CASE_METHOD( SECTION("Not a predicate") { REQUIRE_THROWS_WITH( - QueryExperimental::add_predicate(ctx, query, {"row"}), + QueryExperimental::add_predicate(ctx_, query, {"row"}), Catch::Matchers::ContainsSubstring( "Expression does not return a boolean value")); } SECTION("Schema error") { REQUIRE_THROWS_WITH( - QueryExperimental::add_predicate(ctx, query, {"depth = 3"}), + QueryExperimental::add_predicate(ctx_, query, {"depth = 3"}), Catch::Matchers::ContainsSubstring( "Error adding predicate: Parse error: Schema error: No field " "named depth. 
Valid fields are row, col, a, v, e.")); @@ -393,13 +401,13 @@ TEST_CASE_METHOD( "file an bug report in our issue tracker"; REQUIRE_THROWS_WITH( QueryExperimental::add_predicate( - ctx, query, {"starts_with(row, '1')"}), + ctx_, query, {"starts_with(row, '1')"}), Catch::Matchers::ContainsSubstring(dferror)); } SECTION("Aggregate") { REQUIRE_THROWS_WITH( - QueryExperimental::add_predicate(ctx, query, {"sum(row) >= 10"}), + QueryExperimental::add_predicate(ctx_, query, {"sum(row) >= 10"}), Catch::Matchers::ContainsSubstring( "Aggregate functions in predicate is not supported")); } @@ -416,10 +424,8 @@ TEST_CASE_METHOD( create_array(array_name, TILEDB_SPARSE); write_array(array_name); - auto ctx = context(); - - Array array(ctx, array_name, TILEDB_READ); - Query query(ctx, array); + Array array(ctx_, array_name, TILEDB_READ); + Query query(ctx_, array); query.set_layout(TILEDB_GLOBAL_ORDER); @@ -430,7 +436,7 @@ TEST_CASE_METHOD( templates::query::make_field_sizes(out, out.size()); templates::query::set_fields( - ctx.ptr().get(), + ctx_.ptr().get(), query.ptr().get(), field_sizes, out, @@ -443,7 +449,7 @@ TEST_CASE_METHOD( "Cannot add query predicate; Adding a predicate to an already " "initialized query is not supported."); REQUIRE_THROWS_WITH( - QueryExperimental::add_predicate(ctx, query, "row = col"), expect_err); + QueryExperimental::add_predicate(ctx_, query, "row = col"), expect_err); } TEST_CASE_METHOD( @@ -493,9 +499,16 @@ TEST_CASE_METHOD( SECTION("Legacy global order") { Config qconf; qconf["sm.query.sparse_global_order.reader"] = "legacy"; + + QueryArrayKWArgs kwargs; + kwargs.config = qconf; + REQUIRE_THROWS_WITH( query_array( - array_name, TILEDB_GLOBAL_ORDER, {"a IS NULL", "row > col"}, qconf), + array_name, + TILEDB_GLOBAL_ORDER, + {"a IS NULL", "row > col"}, + kwargs), match); } } @@ -621,10 +634,7 @@ TEST_CASE_METHOD( // enumeration not supported yet REQUIRE_THROWS_WITH( query_array(array_name, query_order, {"e < 'california'"}), - 
Catch::Matchers::ContainsSubstring( - "QueryCondition: Error evaluating expression: Cannot process field " - "'e': Attributes with enumerations are not supported in text " - "predicates")); + matchEnumerationNotSupported); } } @@ -759,11 +769,10 @@ TEST_CASE_METHOD( write_array(array_name, INPUT); { - auto ctx = context(); - ArraySchemaEvolution(ctx).drop_attribute("a").array_evolve(array_name); + ArraySchemaEvolution(ctx_).drop_attribute("a").array_evolve(array_name); - ArraySchemaEvolution(ctx) - .add_attribute(Attribute::create(ctx, "a")) + ArraySchemaEvolution(ctx_) + .add_attribute(Attribute::create(ctx_, "a")) .array_evolve(array_name); } @@ -809,3 +818,114 @@ TEST_CASE_METHOD( "LargeUtf8 & Int64")); } } + +TEST_CASE_METHOD( + QueryAddPredicateFx, + "Query add predicate with query condition", + "[query][add_predicate]") { + const auto query_order = TILEDB_GLOBAL_ORDER; + + const std::string array_name = vfs_test_setup_.array_uri( + "test_query_add_predicate_with_query_condition"); + + create_array(array_name, TILEDB_SPARSE); + write_array(array_name); + + const Cells expect_a_is_null = make_cells( + {1, 1, 2, 2, 4}, + {2, 3, 1, 4, 1}, + {std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt}, + {"two", "three", "five", "eight", "thirteen"}, + {4, 7, 7, 0, std::nullopt}); + + const Cells expect_v_starts_with_t = make_cells( + {1, 1, 3, 3, 4}, + {2, 3, 2, 4, 1}, + {std::nullopt, std::nullopt, 6, 4, std::nullopt}, + {"two", "three", "ten", "twelve", "thirteen"}, + {4, 7, std::nullopt, 4, std::nullopt}); + + const Cells expect_e_is_null = make_cells( + {1, 2, 3, 4, 4}, + {4, 3, 2, 1, 4}, + {12, 9, 6, std::nullopt, 0}, + {"four", "seven", "ten", "thirteen", "sixteen"}, + {std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt}); + + const Cells expect_a_is_null_and_v_starts_with_t = make_cells( + {1, 1, 4}, + {2, 3, 1}, + {std::nullopt, std::nullopt, std::nullopt}, + {"two", "three", "thirteen"}, + {4, 7, std::nullopt}); + + 
[[maybe_unused]] const Cells expect_a_and_e_are_null = + make_cells({4}, {1}, {std::nullopt}, {"thirteen"}, {std::nullopt}); + + SECTION("Same") { + QueryArrayKWArgs kwargs; + kwargs.condition.emplace(ctx_); + kwargs.condition.value().init("a", nullptr, 0, TILEDB_EQ); // `a IS NULL` + + const auto qcresult = query_array(array_name, query_order, {}, kwargs); + CHECK(qcresult == expect_a_is_null); + + const auto predresult = query_array(array_name, query_order, {"a IS NULL"}); + CHECK(predresult == expect_a_is_null); + + const auto andresult = + query_array(array_name, query_order, {"a IS NULL"}, kwargs); + CHECK(andresult == expect_a_is_null); + } + + SECTION("Disjoint") { + QueryArrayKWArgs kwargs; + kwargs.condition.emplace(ctx_); + kwargs.condition.value().init("a", nullptr, 0, TILEDB_EQ); // `a IS NULL` + + const auto qcresult = query_array(array_name, query_order, {}, kwargs); + CHECK(qcresult == expect_a_is_null); + + const auto predresult = + query_array(array_name, query_order, {"starts_with(v, 't')"}); + CHECK(predresult == expect_v_starts_with_t); + + const auto andresult = + query_array(array_name, query_order, {"starts_with(v, 't')"}, kwargs); + CHECK(andresult == expect_a_is_null_and_v_starts_with_t); + } + + SECTION("Enumeration in query condition") { + QueryArrayKWArgs kwargs; + kwargs.condition.emplace(ctx_); + kwargs.condition.value().init("e", nullptr, 0, TILEDB_EQ); // `e IS NULL` + + const auto qcresult = query_array(array_name, query_order, {}, kwargs); + CHECK(qcresult == expect_e_is_null); + + const auto predresult = query_array(array_name, query_order, {"a IS NULL"}); + CHECK(predresult == expect_a_is_null); + + // NB: since we re-write the query condition into datafusion + // it also will not support this + REQUIRE_THROWS_WITH( + query_array(array_name, query_order, {"a IS NULL"}, kwargs), + matchEnumerationNotSupported); + } + + SECTION("Enumeration in predicate") { + QueryArrayKWArgs kwargs; + kwargs.condition.emplace(ctx_); + 
kwargs.condition.value().init("a", nullptr, 0, TILEDB_EQ); // `a IS NULL` + + const auto qcresult = query_array(array_name, query_order, {}, kwargs); + CHECK(qcresult == expect_a_is_null); + + REQUIRE_THROWS_WITH( + query_array(array_name, query_order, {"e IS NULL"}), + matchEnumerationNotSupported); + REQUIRE_THROWS_WITH( + query_array(array_name, query_order, {"e IS NULL"}, kwargs), + matchEnumerationNotSupported); + } +} From 1ab697f89193a1bbce26c3f2e6ae9c85a89e76fc Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Wed, 2 Jul 2025 13:21:57 -0400 Subject: [PATCH 28/52] Add tests demonstrating field escaping --- test/src/unit-query-add-predicate.cc | 165 ++++++++++++++++++++------- 1 file changed, 126 insertions(+), 39 deletions(-) diff --git a/test/src/unit-query-add-predicate.cc b/test/src/unit-query-add-predicate.cc index ea9429af641..024b873865a 100644 --- a/test/src/unit-query-add-predicate.cc +++ b/test/src/unit-query-add-predicate.cc @@ -185,10 +185,45 @@ const Cells QueryAddPredicateFx::INPUT = make_cells( 7, std::nullopt}); -const auto matchEnumerationNotSupported = Catch::Matchers::ContainsSubstring( - "QueryCondition: Error evaluating expression: Cannot process field " - "'e': Attributes with enumerations are not supported in text " - "predicates"); +const Cells expect_a_is_null = make_cells( + {1, 1, 2, 2, 4}, + {2, 3, 1, 4, 1}, + {std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt}, + {"two", "three", "five", "eight", "thirteen"}, + {4, 7, 7, 0, std::nullopt}); + +const Cells expect_v_starts_with_t = make_cells( + {1, 1, 3, 3, 4}, + {2, 3, 2, 4, 1}, + {std::nullopt, std::nullopt, 6, 4, std::nullopt}, + {"two", "three", "ten", "twelve", "thirteen"}, + {4, 7, std::nullopt, 4, std::nullopt}); + +const Cells expect_e_is_null = make_cells( + {1, 2, 3, 4, 4}, + {4, 3, 2, 1, 4}, + {12, 9, 6, std::nullopt, 0}, + {"four", "seven", "ten", "thirteen", "sixteen"}, + {std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt}); + +const 
Cells expect_a_is_null_and_v_starts_with_t = make_cells( + {1, 1, 4}, + {2, 3, 1}, + {std::nullopt, std::nullopt, std::nullopt}, + {"two", "three", "thirteen"}, + {4, 7, std::nullopt}); + +[[maybe_unused]] const Cells expect_a_and_e_are_null = + make_cells({4}, {1}, {std::nullopt}, {"thirteen"}, {std::nullopt}); + +auto matchEnumerationNotSupported(std::string enumeration_name = "e") { + return Catch::Matchers::ContainsSubstring( + "QueryCondition: Error evaluating expression: Cannot process field " + "'" + + enumeration_name + + "': Attributes with enumerations are not supported in text " + "predicates"); +} void QueryAddPredicateFx::create_array( const std::string& path, tiledb_array_type_t atype, bool allow_dups) { @@ -634,7 +669,7 @@ TEST_CASE_METHOD( // enumeration not supported yet REQUIRE_THROWS_WITH( query_array(array_name, query_order, {"e < 'california'"}), - matchEnumerationNotSupported); + matchEnumerationNotSupported()); } } @@ -831,37 +866,6 @@ TEST_CASE_METHOD( create_array(array_name, TILEDB_SPARSE); write_array(array_name); - const Cells expect_a_is_null = make_cells( - {1, 1, 2, 2, 4}, - {2, 3, 1, 4, 1}, - {std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt}, - {"two", "three", "five", "eight", "thirteen"}, - {4, 7, 7, 0, std::nullopt}); - - const Cells expect_v_starts_with_t = make_cells( - {1, 1, 3, 3, 4}, - {2, 3, 2, 4, 1}, - {std::nullopt, std::nullopt, 6, 4, std::nullopt}, - {"two", "three", "ten", "twelve", "thirteen"}, - {4, 7, std::nullopt, 4, std::nullopt}); - - const Cells expect_e_is_null = make_cells( - {1, 2, 3, 4, 4}, - {4, 3, 2, 1, 4}, - {12, 9, 6, std::nullopt, 0}, - {"four", "seven", "ten", "thirteen", "sixteen"}, - {std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt}); - - const Cells expect_a_is_null_and_v_starts_with_t = make_cells( - {1, 1, 4}, - {2, 3, 1}, - {std::nullopt, std::nullopt, std::nullopt}, - {"two", "three", "thirteen"}, - {4, 7, std::nullopt}); - - [[maybe_unused]] const Cells 
expect_a_and_e_are_null = - make_cells({4}, {1}, {std::nullopt}, {"thirteen"}, {std::nullopt}); - SECTION("Same") { QueryArrayKWArgs kwargs; kwargs.condition.emplace(ctx_); @@ -910,7 +914,7 @@ TEST_CASE_METHOD( // it also will not support this REQUIRE_THROWS_WITH( query_array(array_name, query_order, {"a IS NULL"}, kwargs), - matchEnumerationNotSupported); + matchEnumerationNotSupported()); } SECTION("Enumeration in predicate") { @@ -923,9 +927,92 @@ TEST_CASE_METHOD( REQUIRE_THROWS_WITH( query_array(array_name, query_order, {"e IS NULL"}), - matchEnumerationNotSupported); + matchEnumerationNotSupported()); REQUIRE_THROWS_WITH( query_array(array_name, query_order, {"e IS NULL"}, kwargs), - matchEnumerationNotSupported); + matchEnumerationNotSupported()); + } +} + +/** + * Test that field names with special characters can be used by enclosing them + * in quotes + */ +TEST_CASE_METHOD( + QueryAddPredicateFx, + "Query add predicate field name escaping", + "[query][add_predicate]") { + const std::string array_name = + vfs_test_setup_.array_uri("test_query_add_predicate_field_name_escape"); + + create_array(array_name, TILEDB_SPARSE); + + // re-name fields to have special characters in them + // (preserve order/types of attributes so we can continue using INPUT) + { + auto enmr = ArrayExperimental::get_enumeration( + ctx_, Array(ctx_, array_name, TILEDB_READ), "us_states"); + + // first drop the old enumeration due to error adding an attribute trying to + // use it: cannot add an attribute using an enumeration which isn't loaded + ArraySchemaEvolution(ctx_) + .drop_attribute("e") + .drop_enumeration("us_states") + .array_evolve(array_name); + + auto evolve = + ArraySchemaEvolution(ctx_) + .drop_attribute("a") + .drop_attribute("v") + .add_attribute( + Attribute::create(ctx_, "'a'").set_nullable(true)) + .add_attribute(Attribute::create(ctx_, "\"v\"")); + + auto e = Attribute::create(ctx_, "e e").set_nullable(true); + AttributeExperimental::set_enumeration_name(ctx_, e, 
"us_states"); + + evolve.add_attribute(e).add_enumeration(enmr); + + evolve.array_evolve(array_name); + } + + write_array(array_name); + + const auto query_order = TILEDB_GLOBAL_ORDER; + + SECTION("WHERE 'a' IS NULL") { + const auto result = + query_array(array_name, query_order, {"\"'a'\" IS NULL"}); + CHECK(result == expect_a_is_null); + } + + SECTION("WHERE starts_with(\"v\", 't')") { + const auto result = query_array( + array_name, query_order, {"starts_with(\"\"\"v\"\"\", 't')"}); + CHECK(result == expect_v_starts_with_t); + } + + SECTION("WHERE \"e e\" IS NULL") { + REQUIRE_THROWS_WITH( + query_array(array_name, query_order, {"\"e e\" IS NULL"}), + matchEnumerationNotSupported("e e")); + } + + SECTION("Query condition rewrite") { + QueryArrayKWArgs kwargs; + kwargs.condition.emplace(ctx_); + kwargs.condition.value().init( + "'a'", nullptr, 0, TILEDB_EQ); // `"'a'" IS NULL` + + const auto qcresult = query_array(array_name, query_order, {}, kwargs); + CHECK(qcresult == expect_a_is_null); + + const std::string pred = "starts_with(\"\"\"v\"\"\", 't')"; + + const auto predresult = query_array(array_name, query_order, {pred}); + CHECK(predresult == expect_v_starts_with_t); + + const auto andresult = query_array(array_name, query_order, {pred}, kwargs); + CHECK(andresult == expect_a_is_null_and_v_starts_with_t); } } From 2c05e33281fbb9dd0d2a389d31166ae58c96c7bd Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Thu, 3 Jul 2025 08:18:17 -0400 Subject: [PATCH 29/52] Add some FFI for sm Buffer --- tiledb/oxidize/CMakeLists.txt | 1 + tiledb/oxidize/cxx-interface/build.rs | 1 + tiledb/oxidize/cxx-interface/src/sm/buffer.rs | 29 +++++++++++++++++++ tiledb/oxidize/cxx-interface/src/sm/mod.rs | 1 + tiledb/sm/buffer/buffer.h | 6 ++++ 5 files changed, 38 insertions(+) create mode 100644 tiledb/oxidize/cxx-interface/src/sm/buffer.rs diff --git a/tiledb/oxidize/CMakeLists.txt b/tiledb/oxidize/CMakeLists.txt index 873b434dd89..de9bc32f225 100644 --- a/tiledb/oxidize/CMakeLists.txt 
+++ b/tiledb/oxidize/CMakeLists.txt @@ -35,6 +35,7 @@ cxxbridge( array_schema.cc common/memory_tracker.rs sm/array_schema/mod.rs + sm/buffer.rs sm/enums/mod.rs sm/query/readers/mod.rs sm/query/ast/mod.rs diff --git a/tiledb/oxidize/cxx-interface/build.rs b/tiledb/oxidize/cxx-interface/build.rs index b4c23c432dd..6c30adca629 100644 --- a/tiledb/oxidize/cxx-interface/build.rs +++ b/tiledb/oxidize/cxx-interface/build.rs @@ -2,6 +2,7 @@ fn main() { let bridge_sources = vec![ "src/common/memory_tracker.rs", "src/sm/array_schema/mod.rs", + "src/sm/buffer.rs", "src/sm/enums/mod.rs", "src/sm/misc/mod.rs", "src/sm/query/ast/mod.rs", diff --git a/tiledb/oxidize/cxx-interface/src/sm/buffer.rs b/tiledb/oxidize/cxx-interface/src/sm/buffer.rs new file mode 100644 index 00000000000..af99bd770d0 --- /dev/null +++ b/tiledb/oxidize/cxx-interface/src/sm/buffer.rs @@ -0,0 +1,29 @@ +#[cxx::bridge] +mod ffi { + #[namespace = "tiledb::sm"] + unsafe extern "C++" { + include!("tiledb/sm/buffer/buffer.h"); + type Buffer; + + fn size(&self) -> u64; + fn offset(&self) -> u64; + + #[cxx_name = "bytes"] + fn as_ptr(&self) -> *const u8; + } +} + +pub use ffi::Buffer; + +impl Buffer { + pub fn as_slice(&self) -> &[u8] { + let ptr = self.as_ptr(); + let ptr = if ptr.is_null() { + assert_eq!(0, self.size()); + std::ptr::NonNull::::dangling().as_ptr() + } else { + ptr + }; + unsafe { std::slice::from_raw_parts(ptr, self.size() as usize) } + } +} diff --git a/tiledb/oxidize/cxx-interface/src/sm/mod.rs b/tiledb/oxidize/cxx-interface/src/sm/mod.rs index 24c91cdf903..b3d86a39d21 100644 --- a/tiledb/oxidize/cxx-interface/src/sm/mod.rs +++ b/tiledb/oxidize/cxx-interface/src/sm/mod.rs @@ -1,4 +1,5 @@ pub mod array_schema; +pub mod buffer; pub mod enums; pub mod misc; pub mod query; diff --git a/tiledb/sm/buffer/buffer.h b/tiledb/sm/buffer/buffer.h index 27048094b2d..cf1c7eb3b25 100644 --- a/tiledb/sm/buffer/buffer.h +++ b/tiledb/sm/buffer/buffer.h @@ -93,6 +93,12 @@ class BufferBase { return 
static_cast(data_); } + /** Returns the buffer data as bytes (this declaration is seemingly redundant + * but helps with Rust FFI declarations) */ + const uint8_t* bytes() const { + return data_as(); + } + /** * Reads from the local data into the input buffer. * From 666e138597f7ef5fc1331f447f49756ecaed9979 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Thu, 3 Jul 2025 08:18:45 -0400 Subject: [PATCH 30/52] FFI use_enumeration --- tiledb/oxidize/cxx-interface/src/sm/query/ast/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/tiledb/oxidize/cxx-interface/src/sm/query/ast/mod.rs b/tiledb/oxidize/cxx-interface/src/sm/query/ast/mod.rs index 4644715b732..7fcd1eb9999 100644 --- a/tiledb/oxidize/cxx-interface/src/sm/query/ast/mod.rs +++ b/tiledb/oxidize/cxx-interface/src/sm/query/ast/mod.rs @@ -20,6 +20,7 @@ mod ffi { fn get_offsets(&self) -> &ByteVecValue; fn num_children(&self) -> u64; fn get_child(&self, i: u64) -> *const ASTNode; + fn use_enumeration(&self) -> bool; } impl SharedPtr {} From 069162b97e0db0981782dc7c0c27840624f9bf48 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Thu, 3 Jul 2025 08:20:14 -0400 Subject: [PATCH 31/52] Bindings for accessing enumeration contents and locating them in a schema --- .../oxidize/cxx-interface/cc/array_schema.cc | 20 ++++++++ .../oxidize/cxx-interface/cc/array_schema.h | 48 +++++++++++++++++ .../cxx-interface/src/sm/array_schema/mod.rs | 51 ++++++++++++++++++- 3 files changed, 118 insertions(+), 1 deletion(-) diff --git a/tiledb/oxidize/cxx-interface/cc/array_schema.cc b/tiledb/oxidize/cxx-interface/cc/array_schema.cc index d397072bfb7..8b16364b49b 100644 --- a/tiledb/oxidize/cxx-interface/cc/array_schema.cc +++ b/tiledb/oxidize/cxx-interface/cc/array_schema.cc @@ -30,4 +30,24 @@ void set_tile_extent(Dimension& dimension, rust::Slice domain) { } // namespace dimension +namespace array_schema { + +std::unique_ptr> enumerations( + const ArraySchema& schema) { + std::unique_ptr> e( + new 
std::vector(schema.enumeration_map().size())); + + for (const auto& enmr : schema.enumeration_map()) { + if (enmr.second == nullptr) { + e->push_back(MaybeEnumeration::not_loaded(enmr.first)); + } else { + e->push_back(MaybeEnumeration::loaded(enmr.second)); + } + } + + return e; +} + +} // namespace array_schema + } // namespace tiledb::oxidize::sm diff --git a/tiledb/oxidize/cxx-interface/cc/array_schema.h b/tiledb/oxidize/cxx-interface/cc/array_schema.h index 464d9e877bc..85d52a7589e 100644 --- a/tiledb/oxidize/cxx-interface/cc/array_schema.h +++ b/tiledb/oxidize/cxx-interface/cc/array_schema.h @@ -3,6 +3,7 @@ #include "tiledb/sm/array_schema/attribute.h" #include "tiledb/sm/array_schema/dimension.h" #include "tiledb/sm/array_schema/domain.h" +#include "tiledb/sm/array_schema/enumeration.h" namespace tiledb::oxidize::sm { @@ -28,7 +29,54 @@ void set_tile_extent(Dimension& dimension, rust::Slice domain); } // namespace dimension namespace enumeration { + using ConstEnumeration = const tiledb::sm::Enumeration; + +static inline rust::Slice data_cxx( + const Enumeration& enumeration) { + std::span span = enumeration.data(); + return rust::Slice(span.data(), span.size()); +} + +static inline rust::Slice offsets_cxx( + const Enumeration& enumeration) { + std::span span = enumeration.offsets(); + return rust::Slice(span.data(), span.size()); } +} // namespace enumeration + +namespace array_schema { + +struct MaybeEnumeration { + std::optional> name_; + std::shared_ptr value_; + + static MaybeEnumeration not_loaded(const std::string& enumeration_name) { + return MaybeEnumeration{ + .name_ = std::optional(std::cref(enumeration_name)), .value_ = nullptr}; + } + + static MaybeEnumeration loaded(std::shared_ptr value) { + return MaybeEnumeration{.name_ = std::nullopt, .value_ = value}; + } + + const std::string& name() const { + if (name_.has_value()) { + return name_.value().get(); + } else { + return value_->name(); + } + } + + std::shared_ptr get() const { + return 
value_; + } +}; + +std::unique_ptr> enumerations( + const ArraySchema& schema); + +} // namespace array_schema + } // namespace tiledb::oxidize::sm diff --git a/tiledb/oxidize/cxx-interface/src/sm/array_schema/mod.rs b/tiledb/oxidize/cxx-interface/src/sm/array_schema/mod.rs index 9a83cd3ed60..4d03d958028 100644 --- a/tiledb/oxidize/cxx-interface/src/sm/array_schema/mod.rs +++ b/tiledb/oxidize/cxx-interface/src/sm/array_schema/mod.rs @@ -77,6 +77,12 @@ mod ffi { #[cxx_name = "type"] fn datatype(&self) -> Datatype; + + #[namespace = "tiledb::oxidize::sm::enumeration"] + fn data_cxx(enumeration: &Enumeration) -> &[u8]; + + #[namespace = "tiledb::oxidize::sm::enumeration"] + fn offsets_cxx(enumeration: &Enumeration) -> &[u8]; } #[namespace = "tiledb::oxidize::sm::enumeration"] @@ -84,6 +90,14 @@ mod ffi { type ConstEnumeration; } + #[namespace = "tiledb::oxidize::sm::array_schema"] + unsafe extern "C++" { + type MaybeEnumeration; + + fn name(&self) -> &CxxString; + fn get(&self) -> SharedPtr; + } + #[namespace = "tiledb::sm"] unsafe extern "C++" { include!("tiledb/sm/array_schema/array_schema.h"); @@ -125,6 +139,9 @@ mod ffi { fn set_cell_order(self: Pin<&mut ArraySchema>, order: Layout); fn set_capacity(self: Pin<&mut ArraySchema>, capacity: u64); fn set_allows_dups(self: Pin<&mut ArraySchema>, allows_dups: bool); + + #[namespace = "tiledb::oxidize::sm::array_schema"] + fn enumerations(schema: &ArraySchema) -> UniquePtr>; } impl SharedPtr {} @@ -145,7 +162,10 @@ use std::str::Utf8Error; use num_traits::ToBytes; -pub use ffi::{ArraySchema, Attribute, ConstAttribute, Datatype, Dimension, Domain, Enumeration}; +pub use ffi::{ + ArraySchema, Attribute, ConstAttribute, Datatype, Dimension, Domain, Enumeration, + MaybeEnumeration, +}; #[derive(Debug)] pub enum CellValNum { @@ -340,6 +360,24 @@ impl Enumeration { // SAFETY: non-zero would have been validated by the ArraySchema CellValNum::from_cxx(cxx).unwrap() } + + pub fn data(&self) -> &[u8] { + ffi::data_cxx(self) + } + 
+ pub fn offsets(&self) -> Option<&[u64]> { + let b = ffi::offsets_cxx(self); + if b.is_empty() { + None + } else { + let (prefix, offsets, suffix) = unsafe { b.align_to::() }; + + assert!(prefix.is_empty()); + assert!(suffix.is_empty()); + + Some(offsets) + } + } } impl ArraySchema { @@ -402,4 +440,15 @@ impl ArraySchema { std::mem::transmute::<_, cxx::SharedPtr>(e) } } + + pub fn enumeration(&self, name: &str) -> cxx::SharedPtr { + cxx::let_cxx_string!(cxxname = name); + self.enumeration_cxx(&cxxname) + } + + /// Returns a list of the enumerations in this schema, each of which + /// may or may not be loaded. + pub fn enumerations(&self) -> cxx::UniquePtr> { + ffi::enumerations(self) + } } From 9758e7f069cae09afe82feae00970c31f9d9fc86 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Thu, 3 Jul 2025 08:27:21 -0400 Subject: [PATCH 32/52] ArrowSchema => ArrowArraySchema, contains dyn ArrowArray for enumeration contents --- tiledb/oxidize/arrow/src/enumeration.rs | 45 ++++++++ tiledb/oxidize/arrow/src/lib.rs | 31 ++++-- tiledb/oxidize/arrow/src/record_batch.rs | 66 ++++++------ tiledb/oxidize/arrow/src/schema.rs | 124 ++++++++++++++++------- tiledb/oxidize/expr/src/lib.rs | 4 +- tiledb/oxidize/expr/src/logical_expr.rs | 2 +- tiledb/oxidize/expr/src/physical_expr.rs | 9 +- tiledb/oxidize/session/src/lib.rs | 2 +- tiledb/sm/query/query_condition.h | 5 +- 9 files changed, 201 insertions(+), 87 deletions(-) create mode 100644 tiledb/oxidize/arrow/src/enumeration.rs diff --git a/tiledb/oxidize/arrow/src/enumeration.rs b/tiledb/oxidize/arrow/src/enumeration.rs new file mode 100644 index 00000000000..d169f948766 --- /dev/null +++ b/tiledb/oxidize/arrow/src/enumeration.rs @@ -0,0 +1,45 @@ +use std::sync::Arc; + +use arrow::array::Array as ArrowArray; +use arrow::datatypes::Field as ArrowField; + +use tiledb_cxx_interface::sm::array_schema::Enumeration; + +use crate::{record_batch, schema}; + +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("Enumeration data 
type error: {0}")] + DataType(#[from] crate::schema::FieldError), + #[error("Enumeration variants error: {0}")] + Variants(#[from] crate::record_batch::FieldError), +} + +pub unsafe fn array_from_enumeration_ffi( + enumeration: &Enumeration, +) -> Result, Error> { + let a = unsafe { array_from_enumeration(enumeration) }?; + Ok(Box::new(super::ArrowArray(a))) +} + +pub unsafe fn array_from_enumeration( + enumeration: &Enumeration, +) -> Result, Error> { + let field = { + let adt = schema::arrow_datatype(enumeration.datatype(), enumeration.cell_val_num())?; + ArrowField::new("unused", adt, false) + }; + + if let Some(offsets) = enumeration.offsets() { + let (_, offsets, _) = unsafe { + // SAFETY: just a transmutes u64 to u8 which always succeeds + // with no possible alignment issues + offsets.align_to::() + }; + Ok(unsafe { + record_batch::to_arrow_array(&field, offsets, Some(enumeration.data()), None) + }?) + } else { + Ok(unsafe { record_batch::to_arrow_array(&field, enumeration.data(), None, None) }?) 
+ } +} diff --git a/tiledb/oxidize/arrow/src/lib.rs b/tiledb/oxidize/arrow/src/lib.rs index 7f4eaa7e9d6..cb9d2720b97 100644 --- a/tiledb/oxidize/arrow/src/lib.rs +++ b/tiledb/oxidize/arrow/src/lib.rs @@ -3,24 +3,34 @@ pub mod ffi { #[namespace = "tiledb::sm"] extern "C++" { include!("tiledb/sm/array_schema/array_schema.h"); + include!("tiledb/sm/array_schema/enumeration.h"); include!("tiledb/sm/query/readers/result_tile.h"); type ArraySchema = tiledb_cxx_interface::sm::array_schema::ArraySchema; + type Enumeration = tiledb_cxx_interface::sm::array_schema::Enumeration; type ResultTile = tiledb_cxx_interface::sm::query::readers::ResultTile; } #[namespace = "tiledb::oxidize::arrow::schema"] extern "Rust" { - type ArrowSchema; + type ArrowArraySchema; #[cxx_name = "create"] - fn array_schema_create_arrow_schema(schema: &ArraySchema) -> Result>; + fn array_schema_create_arrow_schema(schema: &ArraySchema) -> Result>; #[cxx_name = "project"] fn array_schema_project_arrow_schema( schema: &ArraySchema, select: &Vec, - ) -> Result>; + ) -> Result>; + } + + #[namespace = "tiledb::oxidize::arrow::array"] + extern "Rust" { + type ArrowArray; + + #[cxx_name = "from_enumeration"] + unsafe fn array_from_enumeration_ffi(enumeration: &Enumeration) -> Result>; } #[namespace = "tiledb::oxidize::arrow::record_batch"] @@ -29,28 +39,35 @@ pub mod ffi { #[cxx_name = "create"] unsafe fn result_tile_to_record_batch( - schema: &ArrowSchema, + schema: &ArrowArraySchema, tile: &ResultTile, ) -> Result>; } } +pub mod enumeration; pub mod offsets; pub mod record_batch; pub mod schema; +use std::sync::Arc; + +use enumeration::array_from_enumeration_ffi; use record_batch::{ArrowRecordBatch, to_record_batch as result_tile_to_record_batch}; use schema::{ - ArrowSchema, cxx::project_arrow as array_schema_project_arrow_schema, + ArrowArraySchema, cxx::project_arrow as array_schema_project_arrow_schema, cxx::to_arrow as array_schema_create_arrow_schema, }; +/// Wraps a [dyn ArrowArray] for passing 
across the FFI boundary. +pub struct ArrowArray(pub Arc); + unsafe impl cxx::ExternType for ArrowRecordBatch { type Id = cxx::type_id!("tiledb::oxidize::arrow::record_batch::ArrowRecordBatch"); type Kind = cxx::kind::Opaque; } -unsafe impl cxx::ExternType for ArrowSchema { - type Id = cxx::type_id!("tiledb::oxidize::arrow::schema::ArrowSchema"); +unsafe impl cxx::ExternType for ArrowArraySchema { + type Id = cxx::type_id!("tiledb::oxidize::arrow::schema::ArrowArraySchema"); type Kind = cxx::kind::Opaque; } diff --git a/tiledb/oxidize/arrow/src/record_batch.rs b/tiledb/oxidize/arrow/src/record_batch.rs index 34e0a6688a0..6f0435f1d9e 100644 --- a/tiledb/oxidize/arrow/src/record_batch.rs +++ b/tiledb/oxidize/arrow/src/record_batch.rs @@ -13,7 +13,6 @@ use arrow::buffer::{Buffer, NullBuffer, OffsetBuffer, ScalarBuffer}; use arrow::datatypes::{self as adt, ArrowPrimitiveType, Field}; use arrow::record_batch::{RecordBatch, RecordBatchOptions}; use tiledb_cxx_interface::sm::query::readers::{ResultTile, TileTuple}; -use tiledb_cxx_interface::sm::tile::Tile; use super::*; use crate::offsets::Error as OffsetsError; @@ -61,11 +60,11 @@ pub struct ArrowRecordBatch { /// long as the returned [RecordBatch] is not used after the [ResultTile] /// is destructed. 
pub unsafe fn to_record_batch( - schema: &ArrowSchema, + schema: &ArrowArraySchema, tile: &ResultTile, ) -> Result, Error> { let columns = schema - .0 + .schema .fields() .iter() .map(|f| { @@ -92,13 +91,13 @@ pub unsafe fn to_record_batch( .collect::>, _>>()?; // SAFETY: should be clear from iteration - assert_eq!(schema.0.fields().len(), columns.len()); + assert_eq!(schema.schema.fields().len(), columns.len()); // SAFETY: `tile_to_arrow_array` must do this, major internal error if not // which is not recoverable assert!( schema - .0 + .schema .fields() .iter() .zip(columns.iter()) @@ -112,19 +111,19 @@ pub unsafe fn to_record_batch( assert!( columns.iter().all(|c| c.len() as u64 == tile.cell_num()), "Columns do not all have same number of cells: {:?} {:?}", - schema.0.fields(), + schema.schema.fields(), columns.iter().map(|c| c.len()).collect::>() ); // SAFETY: the four asserts above rule out each of the possible error conditions let arrow = if columns.is_empty() { RecordBatch::try_new_with_options( - Arc::clone(&schema.0), + Arc::clone(&schema.schema), columns, &RecordBatchOptions::new().with_row_count(Some(tile.cell_num() as usize)), ) } else { - RecordBatch::try_new(Arc::clone(&schema.0), columns) + RecordBatch::try_new(Arc::clone(&schema.schema), columns) }; let arrow = arrow.expect("Logic error: preconditions for constructing RecordBatch not met"); @@ -148,41 +147,40 @@ unsafe fn tile_to_arrow_array( unsafe { // SAFETY: the caller is responsible that each of the tiles tile out-live // the `Arc` created here. See function docs. - to_arrow_array(f, tile.fixed_tile(), tile.var_tile(), tile.validity_tile()) + to_arrow_array( + f, + tile.fixed_tile().as_slice(), + tile.var_tile().map(|t| t.as_slice()), + tile.validity_tile().map(|t| t.as_slice()), + ) } } /// Returns an [ArrowArray] which contains the same contents as the provided -/// triple of [Tile]s. +/// triple of `&[u8]`s. 
/// /// If `var.is_some()`, then `fixed` contains the offsets and `var` contains /// the values. Otherwise, `fixed` contains the values. /// -/// The `validity` [Tile] contains one value per cell. +/// The `validity` `&[u8]` contains one value per cell. /// /// # Safety /// /// When possible this function avoids copying data. This means that the -/// returned [ArrowArray] may reference data which lives inside the [Tile]s. +/// returned [ArrowArray] may reference data which lives inside the `&[u8]`s. /// This function is safe to call as long as the returned [ArrowArray] is not -/// used after those [Tile]s are destructed. -unsafe fn to_arrow_array( +/// used after the data which the `&[u8]` borrows are destructed. +pub unsafe fn to_arrow_array( f: &Field, - fixed: &Tile, - var: Option<&Tile>, - validity: Option<&Tile>, + fixed: &[u8], + var: Option<&[u8]>, + validity: Option<&[u8]>, ) -> Result, FieldError> { let null_buffer = if let Some(validity) = validity { if !f.is_nullable() { return Err(FieldError::UnexpectedValidityTile); } - Some( - validity - .as_slice() - .iter() - .map(|v| *v != 0) - .collect::(), - ) + Some(validity.iter().map(|v| *v != 0).collect::()) } else if f.is_nullable() { // NB: this is allowed even for nullable fields, it means that none of // the cells is `NULL`. Note that due to schema evolution the arrow @@ -234,7 +232,7 @@ unsafe fn to_arrow_array( let Some(var_tile) = var else { return Err(FieldError::ExpectedVarTile); }; - let offsets = crate::offsets::try_from_bytes(1, fixed.as_slice())?; + let offsets = crate::offsets::try_from_bytes(1, fixed)?; let values = unsafe { // SAFETY: the caller is responsible that `fixed` out-lives // the `Buffer` created here. See function docs. @@ -289,7 +287,7 @@ unsafe fn to_arrow_array( /// This function is safe to call as long as the returned [PrimitiveArray] /// is not used after the argument [Tile] is destructed. 
unsafe fn to_primitive_array( - tile: &Tile, + bytes: &[u8], validity: Option, ) -> Result, FieldError> where @@ -297,24 +295,24 @@ where { let values = unsafe { // SAFETY: TODO add comment - to_buffer::(tile) + to_buffer::(bytes) }?; Ok(Arc::new(PrimitiveArray::::new(values, validity)) as Arc) } -/// Returns a [Buffer] which refers to the data contained inside the [Tile]. +/// Returns a [Buffer] which refers to the data contained inside the `&[u8]`. /// /// # Safety /// /// This function is safe to call as long as the returned [Buffer] /// is not used after the argument [Tile] is destructed. -unsafe fn to_buffer(tile: &Tile) -> Result, FieldError> +unsafe fn to_buffer(bytes: &[u8]) -> Result, FieldError> where T: ArrowPrimitiveType, { let (prefix, values, suffix) = { // SAFETY: transmuting u8 to primitive types is safe - unsafe { tile.as_slice().align_to::() } + unsafe { bytes.align_to::() } }; if !(prefix.is_empty() && suffix.is_empty()) { return Err(FieldError::InternalUnalignedValues); @@ -337,15 +335,15 @@ where // 2) there is an implicit lifetime requirement that the Tile must out-live // this Buffer, else we shall suffer use after free // 3) the caller is responsible for upholding that guarantee - unsafe { Buffer::from_custom_allocation(ptr, tile.size() as usize, Arc::new(())) } + unsafe { Buffer::from_custom_allocation(ptr, bytes.len(), Arc::new(())) } } else { Buffer::from_vec(Vec::::new()) }, )) } -/// Returns an [OffsetBuffer] which represents the contents of the [Tile]. -fn to_offsets_buffer(value_field: &Field, tile: &Tile) -> Result, OffsetsError> { +/// Returns an [OffsetBuffer] which represents the contents of the `[u8]`. 
+fn to_offsets_buffer(value_field: &Field, bytes: &[u8]) -> Result, OffsetsError> { let Some(value_size) = value_field.data_type().primitive_width() else { // SAFETY: all list types have primitive element // FIXME: this is true for schema fields, not generally true, @@ -355,5 +353,5 @@ fn to_offsets_buffer(value_field: &Field, tile: &Tile) -> Result, Utf8Error), #[error("Error in field '{0}': {1}")] FieldError(String, FieldError), + #[error("Error in enumeration '{0}': {1}")] + EnumerationError(String, crate::enumeration::Error), } /// An error converting an [ArraySchema] [Field] to [ArrowField]. @@ -28,26 +31,27 @@ pub enum FieldError { InternalInvalidDatatype(u8), #[error("Internal error: enumeration not found: {0}")] InternalEnumerationNotFound(String), + #[error("Enumeration name is not UTF-8")] + EnumerationNameNotUtf8(Vec, Utf8Error), } -/// Wraps a [Schema] for passing across the FFI boundary. -pub struct ArrowSchema(pub Arc); +pub type Enumerations = HashMap>>; -impl Deref for ArrowSchema { - type Target = Arc; - fn deref(&self) -> &Self::Target { - &self.0 - } +/// Wraps a [Schema] for passing across the FFI boundary. 
+pub struct ArrowArraySchema { + pub schema: Arc, + pub enumerations: Arc, } pub mod cxx { use super::*; - pub fn to_arrow(array_schema: &ArraySchema) -> Result, Error> { - Ok(Box::new(ArrowSchema(Arc::new(super::project_arrow( - array_schema, - |_: &Field| true, - )?)))) + pub fn to_arrow(array_schema: &ArraySchema) -> Result, Error> { + let (schema, enumerations) = super::project_arrow(array_schema, |_: &Field| true)?; + Ok(Box::new(ArrowArraySchema { + schema: Arc::new(schema), + enumerations: Arc::new(enumerations), + })) } /// Returns a [Schema] which represents the physical field types of @@ -57,38 +61,88 @@ pub mod cxx { pub fn project_arrow( array_schema: &ArraySchema, select: &Vec, - ) -> Result, Error> { - Ok(Box::new(ArrowSchema(Arc::new(super::project_arrow( - array_schema, - |field: &Field| select.iter().any(|s| s.as_str() == field.name_cxx()), - )?)))) + ) -> Result, Error> { + let (schema, enumerations) = super::project_arrow(array_schema, |field: &Field| { + select.iter().any(|s| s.as_str() == field.name_cxx()) + })?; + Ok(Box::new(ArrowArraySchema { + schema: Arc::new(schema), + enumerations: Arc::new(enumerations), + })) } } -pub fn to_arrow(array_schema: &ArraySchema) -> Result { +pub fn to_arrow(array_schema: &ArraySchema) -> Result<(Schema, Enumerations), Error> { project_arrow(array_schema, |_: &Field| true) } /// Returns a [Schema] which represents the physical field types of the selected fields from `array_schema`. 
-pub fn project_arrow(array_schema: &ArraySchema, select: F) -> Result +pub fn project_arrow( + array_schema: &ArraySchema, + select: F, +) -> Result<(Schema, Enumerations), Error> where F: Fn(&Field) -> bool, { - let fields = array_schema.fields().filter(select).map(|f| { - let field_name = f - .name() - .map_err(|e| Error::NameNotUtf8(f.name_cxx().as_bytes().to_vec(), e))?; - let arrow_type = field_arrow_datatype(array_schema, &f) - .map_err(|e| Error::FieldError(field_name.to_owned(), e))?; - - // NB: fields can always be null due to schema evolution - Ok(ArrowField::new(field_name, arrow_type, true)) - }); - - Ok(Schema { - fields: fields.collect::>()?, - metadata: Default::default(), - }) + let fields = array_schema + .fields() + .filter(select) + .map(|f| { + let field_name = f + .name() + .map_err(|e| Error::NameNotUtf8(f.name_cxx().as_bytes().to_vec(), e))?; + let arrow_type = field_arrow_datatype(array_schema, &f) + .map_err(|e| Error::FieldError(field_name.to_owned(), e))?; + + // NB: fields can always be null due to schema evolution + let arrow = ArrowField::new(field_name, arrow_type, true); + + if let Some(ename) = f.enumeration_name() { + let ename = ename + .map_err(|e| { + let ename_cxx = { + // SAFETY: it's `Some` to get into the block, it still will be + f.enumeration_name_cxx().unwrap() + }; + FieldError::EnumerationNameNotUtf8(ename_cxx.as_bytes().to_vec(), e) + }) + .map_err(|e| Error::FieldError(field_name.to_owned(), e))?; + Ok(arrow.with_metadata(HashMap::from([( + "enumeration".to_owned(), + ename.to_owned(), + )]))) + } else { + Ok(arrow) + } + }) + .collect::>()?; + + let enumerations = fields + .iter() + .filter_map(|f| f.metadata().get("enumeration")) + .unique() + .map(|e| { + let enumeration = array_schema.enumeration(e); + if enumeration.is_null() { + Ok((e.to_owned(), None)) + } else { + let a = unsafe { + // SAFETY: TODO comment + crate::enumeration::array_from_enumeration(&enumeration) + } + .map_err(|err| 
Error::EnumerationError(e.to_owned(), err))?; + Ok((e.to_owned(), Some(a))) + } + }) + .collect::>()?; + + Ok(( + Schema { + fields, + metadata: Default::default(), + }, + enumerations, + )) } /// Returns an [ArrowDataType] which represents the physical data type of `field`. diff --git a/tiledb/oxidize/expr/src/lib.rs b/tiledb/oxidize/expr/src/lib.rs index 8362ed2f5ae..80acb691734 100644 --- a/tiledb/oxidize/expr/src/lib.rs +++ b/tiledb/oxidize/expr/src/lib.rs @@ -17,7 +17,7 @@ mod ffi { type ArrowRecordBatch = tiledb_arrow::record_batch::ArrowRecordBatch; #[namespace = "tiledb::oxidize::arrow::schema"] - type ArrowSchema = tiledb_arrow::schema::ArrowSchema; + type ArrowArraySchema = tiledb_arrow::schema::ArrowArraySchema; } #[namespace = "tiledb::oxidize::datafusion::logical_expr"] @@ -48,7 +48,7 @@ mod ffi { // see the pdavis 65154 branch #[cxx_name = "create"] fn create_physical_expr( - schema: &ArrowSchema, + schema: &ArrowArraySchema, expr: Box, ) -> Result>; } diff --git a/tiledb/oxidize/expr/src/logical_expr.rs b/tiledb/oxidize/expr/src/logical_expr.rs index 9a04f2928d5..56507f92e9e 100644 --- a/tiledb/oxidize/expr/src/logical_expr.rs +++ b/tiledb/oxidize/expr/src/logical_expr.rs @@ -41,7 +41,7 @@ impl LogicalExpr { // SAFETY: the only error we can get from the above is if the arrow schema // has duplicate names, which will not happen since it was constructed from // an ArraySchema which does not allow duplicate names - DFSchema::try_from(arrow_schema).unwrap() + DFSchema::try_from(arrow_schema.0).unwrap() }; Ok(self.0.get_type(&dfschema)?) diff --git a/tiledb/oxidize/expr/src/physical_expr.rs b/tiledb/oxidize/expr/src/physical_expr.rs index 315f97dbb12..44fdd0bee30 100644 --- a/tiledb/oxidize/expr/src/physical_expr.rs +++ b/tiledb/oxidize/expr/src/physical_expr.rs @@ -2,7 +2,6 @@ //! into DataFusion physical expressions which can be evaluated; //! and definitions for evaluating those physical expressions. 
-use std::ops::Deref; use std::sync::Arc; use datafusion::common::arrow::datatypes::DataType as ArrowDataType; @@ -12,7 +11,7 @@ use datafusion::execution::context::ExecutionProps; use datafusion::logical_expr::ColumnarValue; use datafusion::physical_plan::PhysicalExpr as DatafusionPhysicalExpr; use tiledb_arrow::record_batch::ArrowRecordBatch; -use tiledb_arrow::schema::ArrowSchema; +use tiledb_arrow::schema::ArrowArraySchema; use tiledb_cxx_interface::sm::enums::Datatype; use crate::LogicalExpr; @@ -55,12 +54,12 @@ impl PhysicalExpr { /// Returns a [PhysicalExpr] which evaluates a [LogicalExpr] for the given `schema`. pub fn create_physical_expr( - schema: &ArrowSchema, + schema: &ArrowArraySchema, expr: Box, ) -> Result, PhysicalExprError> { let dfschema = DFSchema::from_field_specific_qualified_schema( - vec![None; schema.fields.len()], - schema.deref(), + vec![None; schema.schema.fields().len()], + &schema.schema, ) .map_err(PhysicalExprError::Create)?; let dfexpr = diff --git a/tiledb/oxidize/session/src/lib.rs b/tiledb/oxidize/session/src/lib.rs index e4b7f72a28c..cf3c503bda9 100644 --- a/tiledb/oxidize/session/src/lib.rs +++ b/tiledb/oxidize/session/src/lib.rs @@ -84,7 +84,7 @@ impl Session { } fn parse_expr(&self, expr: &str, array_schema: &ArraySchema) -> Result { - let arrow_schema = tiledb_arrow::schema::to_arrow(array_schema)?; + let (arrow_schema, _) = tiledb_arrow::schema::to_arrow(array_schema)?; let df_schema = { // SAFETY: this only errors if the names are not unique, // which they will be because `ArraySchema` requires it diff --git a/tiledb/sm/query/query_condition.h b/tiledb/sm/query/query_condition.h index ac71d438b65..e398b00237c 100644 --- a/tiledb/sm/query/query_condition.h +++ b/tiledb/sm/query/query_condition.h @@ -48,7 +48,7 @@ using namespace tiledb::common; namespace tiledb::oxidize::arrow::schema { -struct ArrowSchema; +struct ArrowArraySchema; } namespace tiledb::oxidize::datafusion::logical_expr { struct LogicalExpr; @@ -422,7 
+422,8 @@ class QueryCondition { #ifdef HAVE_RUST /** Datafusion expression evaluation */ struct Datafusion { - using BoxSchema = ::rust::Box; + using BoxSchema = + ::rust::Box; using BoxExpr = ::rust::Box; BoxSchema schema_; From b41d1aab36639c99b5aa51b082205d93f35a41aa Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Thu, 3 Jul 2025 08:34:42 -0400 Subject: [PATCH 33/52] Move definitions to .cc file to avoid multiple definition error --- tiledb/oxidize/cxx-interface/cc/array_schema.cc | 14 ++++++++++++++ tiledb/oxidize/cxx-interface/cc/array_schema.h | 14 +++----------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/tiledb/oxidize/cxx-interface/cc/array_schema.cc b/tiledb/oxidize/cxx-interface/cc/array_schema.cc index 8b16364b49b..861745eed39 100644 --- a/tiledb/oxidize/cxx-interface/cc/array_schema.cc +++ b/tiledb/oxidize/cxx-interface/cc/array_schema.cc @@ -30,6 +30,20 @@ void set_tile_extent(Dimension& dimension, rust::Slice domain) { } // namespace dimension +namespace enumeration { + +rust::Slice data_cxx(const Enumeration& enumeration) { + std::span span = enumeration.data(); + return rust::Slice(span.data(), span.size()); +} + +rust::Slice offsets_cxx(const Enumeration& enumeration) { + std::span span = enumeration.offsets(); + return rust::Slice(span.data(), span.size()); +} + +} // namespace enumeration + namespace array_schema { std::unique_ptr> enumerations( diff --git a/tiledb/oxidize/cxx-interface/cc/array_schema.h b/tiledb/oxidize/cxx-interface/cc/array_schema.h index 85d52a7589e..765438abbe0 100644 --- a/tiledb/oxidize/cxx-interface/cc/array_schema.h +++ b/tiledb/oxidize/cxx-interface/cc/array_schema.h @@ -32,17 +32,9 @@ namespace enumeration { using ConstEnumeration = const tiledb::sm::Enumeration; -static inline rust::Slice data_cxx( - const Enumeration& enumeration) { - std::span span = enumeration.data(); - return rust::Slice(span.data(), span.size()); -} - -static inline rust::Slice offsets_cxx( - const Enumeration& 
enumeration) { - std::span span = enumeration.offsets(); - return rust::Slice(span.data(), span.size()); -} +rust::Slice data_cxx(const Enumeration& enumeration); + +rust::Slice offsets_cxx(const Enumeration& enumeration); } // namespace enumeration From d78e434cbfd6d4ce01cb37fe8214d904832642fe Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Thu, 3 Jul 2025 10:56:32 -0400 Subject: [PATCH 34/52] Add WhichSchema to distinguish schema for view vs. storage, passes unit query condition tests --- tiledb/oxidize/arrow/src/lib.rs | 15 +++- tiledb/oxidize/arrow/src/schema.rs | 94 +++++++++++++------- tiledb/oxidize/expr/src/lib.rs | 4 + tiledb/oxidize/expr/src/logical_expr.rs | 3 +- tiledb/oxidize/expr/src/query_condition.rs | 32 +++++-- tiledb/oxidize/session/src/lib.rs | 3 +- tiledb/sm/query/query.cc | 8 +- tiledb/sm/query/query_condition.cc | 21 +++-- tiledb/sm/query/query_condition.h | 22 +++-- tiledb/sm/query/test/unit_query_condition.cc | 4 +- 10 files changed, 149 insertions(+), 57 deletions(-) diff --git a/tiledb/oxidize/arrow/src/lib.rs b/tiledb/oxidize/arrow/src/lib.rs index cb9d2720b97..49eca807335 100644 --- a/tiledb/oxidize/arrow/src/lib.rs +++ b/tiledb/oxidize/arrow/src/lib.rs @@ -1,5 +1,14 @@ #[cxx::bridge] pub mod ffi { + /// Indicates how an [ArraySchema] should be translated into an Arrow [Schema]. + /// + /// See `schema` module documentation. 
+ #[namespace = "tiledb::oxidize::arrow::schema"] + enum WhichSchema { + Storage, + View, + } + #[namespace = "tiledb::sm"] extern "C++" { include!("tiledb/sm/array_schema/array_schema.h"); @@ -16,11 +25,15 @@ pub mod ffi { type ArrowArraySchema; #[cxx_name = "create"] - fn array_schema_create_arrow_schema(schema: &ArraySchema) -> Result>; + fn array_schema_create_arrow_schema( + schema: &ArraySchema, + which: WhichSchema, + ) -> Result>; #[cxx_name = "project"] fn array_schema_project_arrow_schema( schema: &ArraySchema, + which: WhichSchema, select: &Vec, ) -> Result>; } diff --git a/tiledb/oxidize/arrow/src/schema.rs b/tiledb/oxidize/arrow/src/schema.rs index 58078f9331d..d0fdbbfaad0 100644 --- a/tiledb/oxidize/arrow/src/schema.rs +++ b/tiledb/oxidize/arrow/src/schema.rs @@ -1,5 +1,24 @@ //! Provides definitions for mapping an [ArraySchema] or the contents of an //! [ArraySchema] onto representative Arrow [Schema]ta or [Field]s. +//! +//! For the most part, the mapping of fields from [ArraySchema] to Arrow [Schema] +//! is not complicated. In some cases there is not an exact datatype match. +//! We resolve this by using the _physical_ Arrow data type when possible. +//! +//! The greater complexity comes from enumerations. Arrow does have a Dictionary +//! type which seems appropriate; however, the Arrow `DictionaryArray` requires +//! that its keys are valid indices into the dictionary values. But TileDB +//! enumerations require the opposite: invalid key values are specifically +//! allowed, with the expectation that a user will attach additional variants +//! later. +//! +//! To address this we will separately consider schemata for the "array storage" +//! and the "array view". The "array storage" schema contains the attribute key type +//! and the "array view" contains the enumeration value type. Like with SQL +//! views we will apply expression rewrites in order to translate between them. +//! +//! 
As a guideline, the "array storage" schema should be used internally and +//! the "array view" schema should be used for user endpoint APIs. use std::collections::HashMap; use std::str::Utf8Error; use std::sync::Arc; @@ -11,6 +30,8 @@ use itertools::Itertools; use tiledb_cxx_interface::sm::array_schema::{ArraySchema, CellValNum, Field}; use tiledb_cxx_interface::sm::enums::Datatype; +pub use super::ffi::WhichSchema; + /// An error converting [ArraySchema] to [Schema]. #[derive(Debug, thiserror::Error)] pub enum Error { @@ -46,8 +67,11 @@ pub struct ArrowArraySchema { pub mod cxx { use super::*; - pub fn to_arrow(array_schema: &ArraySchema) -> Result, Error> { - let (schema, enumerations) = super::project_arrow(array_schema, |_: &Field| true)?; + pub fn to_arrow( + array_schema: &ArraySchema, + which: WhichSchema, + ) -> Result, Error> { + let (schema, enumerations) = super::project_arrow(array_schema, which, |_: &Field| true)?; Ok(Box::new(ArrowArraySchema { schema: Arc::new(schema), enumerations: Arc::new(enumerations), @@ -60,9 +84,10 @@ pub mod cxx { #[allow(clippy::ptr_arg)] pub fn project_arrow( array_schema: &ArraySchema, + which: WhichSchema, select: &Vec, ) -> Result, Error> { - let (schema, enumerations) = super::project_arrow(array_schema, |field: &Field| { + let (schema, enumerations) = super::project_arrow(array_schema, which, |field: &Field| { select.iter().any(|s| s.as_str() == field.name_cxx()) })?; Ok(Box::new(ArrowArraySchema { @@ -72,13 +97,17 @@ pub mod cxx { } } -pub fn to_arrow(array_schema: &ArraySchema) -> Result<(Schema, Enumerations), Error> { - project_arrow(array_schema, |_: &Field| true) +pub fn to_arrow( + array_schema: &ArraySchema, + which: WhichSchema, +) -> Result<(Schema, Enumerations), Error> { + project_arrow(array_schema, which, |_: &Field| true) } /// Returns a [Schema] which represents the physical field types of the selected fields from `array_schema`. 
pub fn project_arrow( array_schema: &ArraySchema, + which: WhichSchema, select: F, ) -> Result<(Schema, Enumerations), Error> where @@ -91,7 +120,7 @@ where let field_name = f .name() .map_err(|e| Error::NameNotUtf8(f.name_cxx().as_bytes().to_vec(), e))?; - let arrow_type = field_arrow_datatype(array_schema, &f) + let arrow_type = field_arrow_datatype(array_schema, which, &f) .map_err(|e| Error::FieldError(field_name.to_owned(), e))?; // NB: fields can always be null due to schema evolution @@ -148,34 +177,39 @@ where /// Returns an [ArrowDataType] which represents the physical data type of `field`. pub fn field_arrow_datatype( array_schema: &ArraySchema, + which: WhichSchema, field: &Field, ) -> Result { - if let Some(e_name) = field.enumeration_name_cxx() { - if !array_schema.has_enumeration(e_name) { - return Err(FieldError::InternalEnumerationNotFound( - e_name.to_string_lossy().into_owned(), - )); - } + match which { + WhichSchema::Storage => arrow_datatype(field.datatype(), field.cell_val_num()), + WhichSchema::View => { + let Some(e_name) = field.enumeration_name_cxx() else { + return arrow_datatype(field.datatype(), field.cell_val_num()); + }; + if !array_schema.has_enumeration(e_name) { + return Err(FieldError::InternalEnumerationNotFound( + e_name.to_string_lossy().into_owned(), + )); + } - let enumeration = array_schema.enumeration_cxx(e_name); + let enumeration = array_schema.enumeration_cxx(e_name); - let key_type = arrow_datatype(field.datatype(), field.cell_val_num())?; - let value_type = if let Some(enumeration) = enumeration.as_ref() { - arrow_datatype(enumeration.datatype(), enumeration.cell_val_num())? - } else { - // NB: we don't necessarily want to return an error here - // because the enumeration might not actually be used - // in a predicate. We can return some representation - // which we will check later if it is actually used, - // and return an error then. 
- ArrowDataType::Null - }; - Ok(ArrowDataType::Dictionary( - Box::new(key_type), - Box::new(value_type), - )) - } else { - arrow_datatype(field.datatype(), field.cell_val_num()) + let value_type = if let Some(enumeration) = enumeration.as_ref() { + arrow_datatype(enumeration.datatype(), enumeration.cell_val_num())? + } else { + // NB: we don't necessarily want to return an error here + // because the enumeration might not actually be used + // in a predicate. We can return some representation + // which we will check later if it is actually used, + // and return an error then. + ArrowDataType::Null + }; + Ok(value_type) + } + invalid => unreachable!( + "Request for invalid schema type with discriminant {}", + invalid.repr + ), } } diff --git a/tiledb/oxidize/expr/src/lib.rs b/tiledb/oxidize/expr/src/lib.rs index 80acb691734..b7f7355e81b 100644 --- a/tiledb/oxidize/expr/src/lib.rs +++ b/tiledb/oxidize/expr/src/lib.rs @@ -18,6 +18,9 @@ mod ffi { #[namespace = "tiledb::oxidize::arrow::schema"] type ArrowArraySchema = tiledb_arrow::schema::ArrowArraySchema; + + #[namespace = "tiledb::oxidize::arrow::schema"] + type WhichSchema = tiledb_arrow::schema::WhichSchema; } #[namespace = "tiledb::oxidize::datafusion::logical_expr"] @@ -32,6 +35,7 @@ mod ffi { #[cxx_name = "create"] fn query_condition_to_logical_expr( schema: &ArraySchema, + which: &WhichSchema, query_condition: &ASTNode, ) -> Result>; diff --git a/tiledb/oxidize/expr/src/logical_expr.rs b/tiledb/oxidize/expr/src/logical_expr.rs index 56507f92e9e..d3f60f56eb8 100644 --- a/tiledb/oxidize/expr/src/logical_expr.rs +++ b/tiledb/oxidize/expr/src/logical_expr.rs @@ -6,6 +6,7 @@ use arrow::datatypes::DataType as ArrowDataType; use datafusion::common::tree_node::{TreeNode, TreeNodeRecursion, TreeNodeVisitor}; use datafusion::common::{Column, DFSchema, DataFusionError, ScalarValue}; use datafusion::logical_expr::{Expr, ExprSchemable}; +use tiledb_arrow::schema::WhichSchema; use 
tiledb_cxx_interface::sm::array_schema::ArraySchema; #[derive(Debug, thiserror::Error)] @@ -30,7 +31,7 @@ impl LogicalExpr { pub fn output_type(&self, schema: &ArraySchema) -> Result { let cols = self.0.column_refs(); - let arrow_schema = tiledb_arrow::schema::project_arrow(schema, |f| { + let arrow_schema = tiledb_arrow::schema::project_arrow(schema, WhichSchema::View, |f| { let Ok(field_name) = f.name() else { // NB: if the field name is not UTF-8 then it cannot possibly match the column name return false; diff --git a/tiledb/oxidize/expr/src/query_condition.rs b/tiledb/oxidize/expr/src/query_condition.rs index 70101cedd96..7d06bdf6e8b 100644 --- a/tiledb/oxidize/expr/src/query_condition.rs +++ b/tiledb/oxidize/expr/src/query_condition.rs @@ -15,6 +15,7 @@ use datafusion::logical_expr::{BinaryExpr, Expr, Operator}; use itertools::Itertools; use num_traits::FromBytes; use tiledb_arrow::offsets::Error as OffsetsError; +use tiledb_arrow::schema::WhichSchema; use tiledb_cxx_interface::sm::array_schema::{ArraySchema, CellValNum, Field}; use tiledb_cxx_interface::sm::enums::{Datatype, QueryConditionCombinationOp, QueryConditionOp}; use tiledb_cxx_interface::sm::misc::ByteVecValue; @@ -104,6 +105,7 @@ where fn leaf_ast_to_binary_expr( schema: &ArraySchema, + which: WhichSchema, ast: &ASTNode, op: Operator, ) -> Result { @@ -115,6 +117,7 @@ fn leaf_ast_to_binary_expr( fn apply( schema: &ArraySchema, + which: WhichSchema, field: &Field, ast: &ASTNode, operator: Operator, @@ -132,8 +135,8 @@ fn leaf_ast_to_binary_expr( .map(ScalarValue::from) .peekable(); - let expect_datatype = - tiledb_arrow::schema::field_arrow_datatype(schema, field).map_err(|e| { + let expect_datatype = tiledb_arrow::schema::field_arrow_datatype(schema, which, field) + .map_err(|e| { InternalError::SchemaField(field.name_cxx().to_string_lossy().into_owned(), e) })?; @@ -205,7 +208,7 @@ fn leaf_ast_to_binary_expr( apply_physical_type!( value_type, NativeType, - apply::(schema, &field, ast, op), + 
apply::(schema, which, &field, ast, op), |invalid: Datatype| Err(InternalError::InvalidDatatype(invalid.repr.into()).into()) ) } @@ -367,12 +370,13 @@ fn leaf_ast_to_null_test(schema: &ArraySchema, ast: &ASTNode) -> Result Result { let mut level = query_condition .children() - .map(|ast| to_datafusion_impl(schema, ast)) + .map(|ast| to_datafusion_impl(schema, which, ast)) .collect::, _>>()?; while level.len() != 1 { @@ -402,21 +406,25 @@ fn combination_ast_to_binary_expr( Ok(level.into_iter().next().unwrap()) } -fn to_datafusion_impl(schema: &ArraySchema, query_condition: &ASTNode) -> Result { +fn to_datafusion_impl( + schema: &ArraySchema, + which: WhichSchema, + query_condition: &ASTNode, +) -> Result { if query_condition.is_expr() { match *query_condition.get_combination_op() { QueryConditionCombinationOp::AND => { - combination_ast_to_binary_expr(schema, query_condition, Operator::And) + combination_ast_to_binary_expr(schema, which, query_condition, Operator::And) } QueryConditionCombinationOp::OR => { - combination_ast_to_binary_expr(schema, query_condition, Operator::Or) + combination_ast_to_binary_expr(schema, which, query_condition, Operator::Or) } QueryConditionCombinationOp::NOT => { let children = query_condition.children().collect::>(); if children.len() != 1 { return Err(InternalError::NotTree(children.len()).into()); } - let negate_arg = to_datafusion_impl(schema, children[0])?; + let negate_arg = to_datafusion_impl(schema, which, children[0])?; Ok(!negate_arg) } invalid => Err(InternalError::InvalidCombinationOp(invalid.repr.into()).into()), @@ -427,31 +435,37 @@ fn to_datafusion_impl(schema: &ArraySchema, query_condition: &ASTNode) -> Result match *query_condition.get_op() { QueryConditionOp::LT => Ok(leaf_ast_to_binary_expr( schema, + which, query_condition, Operator::Lt, )?), QueryConditionOp::LE => Ok(leaf_ast_to_binary_expr( schema, + which, query_condition, Operator::LtEq, )?), QueryConditionOp::GT => Ok(leaf_ast_to_binary_expr( schema, + 
which, query_condition, Operator::Gt, )?), QueryConditionOp::GE => Ok(leaf_ast_to_binary_expr( schema, + which, query_condition, Operator::GtEq, )?), QueryConditionOp::EQ => Ok(leaf_ast_to_binary_expr( schema, + which, query_condition, Operator::Eq, )?), QueryConditionOp::NE => Ok(leaf_ast_to_binary_expr( schema, + which, query_condition, Operator::NotEq, )?), @@ -489,10 +503,12 @@ fn to_datafusion_impl(schema: &ArraySchema, query_condition: &ASTNode) -> Result /// as the requested query condition. pub fn to_datafusion( schema: &ArraySchema, + which: &WhichSchema, query_condition: &ASTNode, ) -> Result, Error> { Ok(Box::new(LogicalExpr(to_datafusion_impl( schema, + *which, query_condition, )?))) } diff --git a/tiledb/oxidize/session/src/lib.rs b/tiledb/oxidize/session/src/lib.rs index cf3c503bda9..d3d0a454584 100644 --- a/tiledb/oxidize/session/src/lib.rs +++ b/tiledb/oxidize/session/src/lib.rs @@ -84,7 +84,8 @@ impl Session { } fn parse_expr(&self, expr: &str, array_schema: &ArraySchema) -> Result { - let (arrow_schema, _) = tiledb_arrow::schema::to_arrow(array_schema)?; + let (arrow_schema, _) = + tiledb_arrow::schema::to_arrow(array_schema, tiledb_arrow::schema::WhichSchema::View)?; let df_schema = { // SAFETY: this only errors if the names are not unique, // which they will be because `ArraySchema` requires it diff --git a/tiledb/sm/query/query.cc b/tiledb/sm/query/query.cc index 3a8a1ce3caf..0106512d6b9 100644 --- a/tiledb/sm/query/query.cc +++ b/tiledb/sm/query/query.cc @@ -60,6 +60,7 @@ #include "tiledb/sm/tile/writer_tile_tuple.h" #ifdef HAVE_RUST +#include "tiledb/oxidize/arrow.h" #include "tiledb/oxidize/expr.h" #include "tiledb/oxidize/session.h" #endif @@ -734,7 +735,9 @@ void Query::init() { try { // treat existing query condition (if any) as datafusion if (condition_.has_value()) { - predicates_.push_back(condition_->as_datafusion(array_schema())); + predicates_.push_back(condition_->as_datafusion( + array_schema(), + 
tiledb::oxidize::arrow::schema::WhichSchema::View)); condition_.reset(); } @@ -877,7 +880,8 @@ Status Query::process() { #ifdef HAVE_RUST auto timer_se = stats_->start_timer("query_condition_rewrite_to_datafusion"); - condition_->rewrite_to_datafusion(array_schema()); + condition_->rewrite_to_datafusion( + array_schema(), tiledb::oxidize::arrow::schema::WhichSchema::Storage); #else std::stringstream ss; ss << "Invalid value for parameter '" << evaluator_param_name diff --git a/tiledb/sm/query/query_condition.cc b/tiledb/sm/query/query_condition.cc index e7dab0c6391..66064b2367a 100644 --- a/tiledb/sm/query/query_condition.cc +++ b/tiledb/sm/query/query_condition.cc @@ -114,7 +114,10 @@ QueryCondition::QueryCondition( field_names_.insert(std::string(c.data(), c.size())); } - datafusion_.emplace(array_schema, std::move(expr)); + datafusion_.emplace( + array_schema, + tiledb::oxidize::arrow::schema::WhichSchema::View, + std::move(expr)); } #endif @@ -182,15 +185,20 @@ void QueryCondition::rewrite_for_schema(const ArraySchema& array_schema) { #ifdef HAVE_RUST rust::Box -QueryCondition::as_datafusion(const ArraySchema& array_schema) { +QueryCondition::as_datafusion( + const ArraySchema& array_schema, + tiledb::oxidize::arrow::schema::WhichSchema which) { return tiledb::oxidize::datafusion::logical_expr::create( - array_schema, *tree_.get()); + array_schema, which, *tree_.get()); } -bool QueryCondition::rewrite_to_datafusion(const ArraySchema& array_schema) { +bool QueryCondition::rewrite_to_datafusion( + const ArraySchema& array_schema, + tiledb::oxidize::arrow::schema::WhichSchema which) { if (!datafusion_.has_value()) { try { - datafusion_.emplace(array_schema, as_datafusion(array_schema)); + datafusion_.emplace( + array_schema, which, as_datafusion(array_schema, which)); } catch (const ::rust::Error& e) { throw QueryConditionException( "Error compiling expression: " + std::string(e.what())); @@ -2987,9 +2995,10 @@ uint64_t QueryCondition::condition_index() const { 
#ifdef HAVE_RUST QueryCondition::Datafusion::Datafusion( const ArraySchema& array_schema, + tiledb::oxidize::arrow::schema::WhichSchema which, rust::Box&& expr) : schema_(tiledb::oxidize::arrow::schema::project( - array_schema, expr->columns())) + array_schema, which, expr->columns())) , expr_(tiledb::oxidize::datafusion::physical_expr::create( *schema_, std::move(expr))) { } diff --git a/tiledb/sm/query/query_condition.h b/tiledb/sm/query/query_condition.h index e398b00237c..6e210cdbc54 100644 --- a/tiledb/sm/query/query_condition.h +++ b/tiledb/sm/query/query_condition.h @@ -49,7 +49,8 @@ using namespace tiledb::common; namespace tiledb::oxidize::arrow::schema { struct ArrowArraySchema; -} +enum class WhichSchema : uint8_t; +} // namespace tiledb::oxidize::arrow::schema namespace tiledb::oxidize::datafusion::logical_expr { struct LogicalExpr; } @@ -212,21 +213,27 @@ class QueryCondition { * If desired and possible, rewrite the query condition to use Datafusion to * evaluate. * - * Note that this is basically for testing, this isn't expected to be a - * production feature - we will have other entry points for Datafusion which - * make more sense. Datafusion evaluation appears to be slightly slower, which - * makes some sense since we must create arrow and datafusion data structures. + * This is principally used for testing, but may also be called from + * production if a query has both a query condition and a datafusion predicate + * added. 
+ * + * @param array_schema + * @param which The manner of interpreting the array_schema into Arrow * * @return true if a rewrite occurred, false otherwise */ - bool rewrite_to_datafusion(const ArraySchema& array_schema); + bool rewrite_to_datafusion( + const ArraySchema& array_schema, + tiledb::oxidize::arrow::schema::WhichSchema which); /** * @return an equivalent representation of this condition's expression tree as * a Datafusion logical expression */ rust::Box - as_datafusion(const ArraySchema& array_schema); + as_datafusion( + const ArraySchema& array_schema, + tiledb::oxidize::arrow::schema::WhichSchema which); #endif /** @@ -436,6 +443,7 @@ class QueryCondition { Datafusion( const ArraySchema& array_schema, + tiledb::oxidize::arrow::schema::WhichSchema which, rust::Box&& expr); diff --git a/tiledb/sm/query/test/unit_query_condition.cc b/tiledb/sm/query/test/unit_query_condition.cc index bfc2acf66f3..df10109ab27 100644 --- a/tiledb/sm/query/test/unit_query_condition.cc +++ b/tiledb/sm/query/test/unit_query_condition.cc @@ -50,6 +50,7 @@ #ifdef HAVE_RUST #include "test/support/assert_helpers.h" +#include "tiledb/oxidize/arrow.h" #include "tiledb/oxidize/unit_query_condition.h" #endif @@ -5194,7 +5195,8 @@ std::vector instance( // set up datafusion evaluation QueryCondition qc_datafusion(ast.clone()); qc_datafusion.rewrite_for_schema(array_schema); - const bool datafusion_ok = qc_datafusion.rewrite_to_datafusion(array_schema); + const bool datafusion_ok = qc_datafusion.rewrite_to_datafusion( + array_schema, tiledb::oxidize::arrow::schema::WhichSchema::Storage); ASSERTER(datafusion_ok); // prepare to evaluate From 860d3d5fbb64d3f56549a272291e3810f60f381c Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Thu, 3 Jul 2025 11:16:31 -0400 Subject: [PATCH 35/52] Fix wrong write size in unit_query_condition.cc --- tiledb/sm/query/test/unit_query_condition.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tiledb/sm/query/test/unit_query_condition.cc 
b/tiledb/sm/query/test/unit_query_condition.cc index df10109ab27..26b57a00892 100644 --- a/tiledb/sm/query/test/unit_query_condition.cc +++ b/tiledb/sm/query/test/unit_query_condition.cc @@ -5348,7 +5348,7 @@ TEST_CASE("QueryCondition: Apache DataFusion evaluation", "[QueryCondition]") { tile.tile_tuple("v")->fixed_tile().write( offsets_v.data(), 0, offsets_v.size() * sizeof(uint64_t)); - tile.tile_tuple("v")->var_tile().write(&values_v[0], 0, sizeof(values_v)); + tile.tile_tuple("v")->var_tile().write(&values_v[0], 0, values_v.size()); tile.tile_tuple("v")->validity_tile().write( validity_v.data(), 0, validity_v.size() * sizeof(uint8_t)); From 9d8e4ffe47b9467c1a404cca26bb61e22b7b4f0e Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Thu, 3 Jul 2025 12:32:29 -0400 Subject: [PATCH 36/52] Fix UTF-8, unit_query_condition passes --- .../cxx-interface/src/sm/array_schema/mod.rs | 4 + tiledb/oxidize/expr/src/query_condition.rs | 88 ++++++++++++++----- .../unit-query-condition/src/lib.rs | 41 +++++++-- 3 files changed, 102 insertions(+), 31 deletions(-) diff --git a/tiledb/oxidize/cxx-interface/src/sm/array_schema/mod.rs b/tiledb/oxidize/cxx-interface/src/sm/array_schema/mod.rs index 4d03d958028..0637b2c9574 100644 --- a/tiledb/oxidize/cxx-interface/src/sm/array_schema/mod.rs +++ b/tiledb/oxidize/cxx-interface/src/sm/array_schema/mod.rs @@ -185,6 +185,10 @@ impl CellValNum { n => Some(Self::Fixed(NonZeroU32::new(n)?)), } } + + pub fn is_var(&self) -> bool { + matches!(self, CellValNum::Var) + } } impl Display for CellValNum { diff --git a/tiledb/oxidize/expr/src/query_condition.rs b/tiledb/oxidize/expr/src/query_condition.rs index 7d06bdf6e8b..f20b01e5270 100644 --- a/tiledb/oxidize/expr/src/query_condition.rs +++ b/tiledb/oxidize/expr/src/query_condition.rs @@ -8,7 +8,7 @@ use datafusion::common::arrow::array::{ self as aa, Array as ArrowArray, ArrayData, FixedSizeListArray, GenericListArray, }; use datafusion::common::arrow::buffer::OffsetBuffer; -use 
datafusion::common::arrow::datatypes::Field as ArrowField; +use datafusion::common::arrow::datatypes::{DataType as ArrowDataType, Field as ArrowField}; use datafusion::common::{Column, ScalarValue}; use datafusion::logical_expr::expr::InList; use datafusion::logical_expr::{BinaryExpr, Expr, Operator}; @@ -69,6 +69,8 @@ pub enum UserError { InListCellValNumMismatch(CellValNum, usize), #[error("Variable-length data offsets: ")] InListVarOffsets(#[from] OffsetsError), + #[error("Invalid query condition operand: {0}")] + ExpectedUtf8(#[source] std::string::FromUtf8Error), } /// Returns an iterator over the values of type [T] contained in `bytes`. @@ -177,23 +179,31 @@ fn leaf_ast_to_binary_expr( ) } CellValNum::Var => { - let values = if values.peek().is_none() { - aa::make_array(ArrayData::new_empty(&expect_datatype)) + if matches!(expect_datatype, ArrowDataType::LargeUtf8) { + ScalarValue::LargeUtf8(Some( + String::from_utf8(ast.get_data().as_slice().to_vec()) + .map_err(UserError::ExpectedUtf8)?, + )) } else { - // SAFETY: `values` produces a static type, so all will match. - // `values` is also non-empty per `peek`. - ScalarValue::iter_to_array(values).unwrap() - }; - let element_field = ArrowField::new_list_field(values.data_type().clone(), false); - ScalarValue::LargeList( - GenericListArray::::new( - element_field.into(), - OffsetBuffer::::from_lengths(std::iter::once(values.len())), - values, - None, + let values = if values.peek().is_none() { + aa::make_array(ArrayData::new_empty(&expect_datatype)) + } else { + // SAFETY: `values` produces a static type, so all will match. + // `values` is also non-empty per `peek`. 
+ ScalarValue::iter_to_array(values).unwrap() + }; + let element_field = + ArrowField::new_list_field(values.data_type().clone(), false); + ScalarValue::LargeList( + GenericListArray::::new( + element_field.into(), + OffsetBuffer::::from_lengths(std::iter::once(values.len())), + values, + None, + ) + .into(), ) - .into(), - ) + } } }; @@ -325,13 +335,45 @@ fn leaf_ast_to_in_list(schema: &ArraySchema, ast: &ASTNode, negated: bool) -> Re })) } - let value_type = field.datatype(); - apply_physical_type!( - value_type, - NativeType, - apply::(&field, ast, negated), - |invalid: Datatype| Err(InternalError::InvalidDatatype(invalid.repr.into()).into()) - ) + if matches!( + field.datatype(), + Datatype::STRING_ASCII | Datatype::STRING_UTF8 + ) && field.cell_val_num().is_var() + { + let array_offsets = tiledb_arrow::offsets::try_from_bytes_and_num_values( + field.datatype().value_size(), + ast.get_offsets().as_slice(), + ast.get_data().len(), + ) + .map_err(UserError::from)?; + + let column = Expr::Column(Column::from_name( + field.name().map_err(UserError::FieldNameNotUtf8)?, + )); + let in_list = array_offsets + .windows(2) + .map(|w| { + let elts = ast.get_data().as_slice()[w[0] as usize..w[1] as usize].to_vec(); + String::from_utf8(elts).map_err(UserError::ExpectedUtf8) + }) + .map_ok(|s| ScalarValue::LargeUtf8(Some(s))) + .map_ok(Expr::Literal) + .collect::, _>>()?; + + Ok(Expr::InList(InList { + expr: Box::new(column), + list: in_list, + negated, + })) + } else { + let value_type = field.datatype(); + apply_physical_type!( + value_type, + NativeType, + apply::(&field, ast, negated), + |invalid: Datatype| Err(InternalError::InvalidDatatype(invalid.repr.into()).into()) + ) + } } fn leaf_ast_to_null_test(schema: &ArraySchema, ast: &ASTNode) -> Result { diff --git a/tiledb/oxidize/staticlibs/unit-query-condition/src/lib.rs b/tiledb/oxidize/staticlibs/unit-query-condition/src/lib.rs index b98aaed440c..af1b74781be 100644 --- 
a/tiledb/oxidize/staticlibs/unit-query-condition/src/lib.rs +++ b/tiledb/oxidize/staticlibs/unit-query-condition/src/lib.rs @@ -52,8 +52,8 @@ use tiledb_common::query::condition::QueryConditionExpr; use tiledb_common::query::condition::strategy::Parameters as QueryConditionParameters; use tiledb_pod::array::schema::SchemaData; use tiledb_pod::array::schema::strategy::Requirements as SchemaRequirements; -use tiledb_test_cells::Cells; use tiledb_test_cells::strategy::{CellsParameters, CellsStrategySchema, SchemaWithDomain}; +use tiledb_test_cells::{Cells, FieldData}; fn instance_query_condition_datafusion( schema: &SchemaData, @@ -279,6 +279,33 @@ fn examples_query_condition_datafusion_impl() -> anyhow::Result { Ok(true) } +fn cells_ensure_utf8(schema: &SchemaData, cells: Cells) -> Cells { + let mut new_fields = cells.fields().clone(); + for (fname, fdata) in new_fields.iter_mut() { + let Some(field) = schema.field(fname.clone()) else { + continue; + }; + + use tiledb_common::array::CellValNum; + use tiledb_common::datatype::Datatype; + + if matches!(field.cell_val_num(), Some(CellValNum::Var)) + && matches!( + field.datatype(), + Datatype::StringAscii | Datatype::StringUtf8 + ) + { + let FieldData::VecUInt8(strs) = fdata else { + continue; + }; + strs.iter_mut().for_each(|s| { + *s = String::from_utf8_lossy(s).into_owned().into_bytes(); + }); + } + } + Cells::new(new_fields) +} + /// Returns a [Strategy] which produces inputs to `instance_query_condition_datafusion`. 
fn strat_query_condition_datafusion() -> impl Strategy, Rc, Vec)> { @@ -293,23 +320,21 @@ fn strat_query_condition_datafusion() any_with::(schema_params.into()) .prop_flat_map(|schema| { let schema = Rc::new(schema); + let schema_move_into_strat = Rc::clone(&schema); let strat_cells = any_with::(CellsParameters { schema: Some(CellsStrategySchema::WriteSchema(Rc::clone(&schema))), ..Default::default() - }); + }) + .prop_map(move |cells| cells_ensure_utf8(&schema_move_into_strat, cells)); (Just(schema), strat_cells) }) .prop_flat_map(|(schema, cells)| { let cells = Rc::new(cells); - let strat_params = any_with::(QueryConditionParameters { + let strat_qc = any_with::(QueryConditionParameters { domain: Some(Rc::new(SchemaWithDomain::new(Rc::clone(&schema), &cells))), ..Default::default() }); - ( - Just(schema), - Just(cells), - strat_params.prop_map(|qc| vec![qc]), - ) + (Just(schema), Just(cells), strat_qc.prop_map(|qc| vec![qc])) }) } From d6889bd806eeee4e075080fada1d108d19ae0a1d Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Thu, 3 Jul 2025 12:47:38 -0400 Subject: [PATCH 37/52] Stopgap for enumerations in WhichSchema::View --- tiledb/oxidize/arrow/src/record_batch.rs | 9 ++++--- tiledb/oxidize/arrow/src/schema.rs | 30 ++++++++++++++---------- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/tiledb/oxidize/arrow/src/record_batch.rs b/tiledb/oxidize/arrow/src/record_batch.rs index 6f0435f1d9e..6c94435534d 100644 --- a/tiledb/oxidize/arrow/src/record_batch.rs +++ b/tiledb/oxidize/arrow/src/record_batch.rs @@ -261,11 +261,10 @@ pub unsafe fn to_arrow_array( null_buffer, ))) } - DataType::Dictionary(_, _) => { - // NB: we will do this later, - // it will require some refactoring so that we build the enumeration - // ArrowArrays just once for the whole query, in addition to the - // issues with regards to the enumeration being loaded + DataType::Null => { + // NB: see `arrow/src/schema.rs`. 
+ // This represents the value type of an attribute with an enumeration + // which we will implement later in CORE-285. Err(FieldError::EnumerationNotSupported) } _ => { diff --git a/tiledb/oxidize/arrow/src/schema.rs b/tiledb/oxidize/arrow/src/schema.rs index d0fdbbfaad0..ba5ebe28f60 100644 --- a/tiledb/oxidize/arrow/src/schema.rs +++ b/tiledb/oxidize/arrow/src/schema.rs @@ -192,19 +192,23 @@ pub fn field_arrow_datatype( )); } - let enumeration = array_schema.enumeration_cxx(e_name); - - let value_type = if let Some(enumeration) = enumeration.as_ref() { - arrow_datatype(enumeration.datatype(), enumeration.cell_val_num())? - } else { - // NB: we don't necessarily want to return an error here - // because the enumeration might not actually be used - // in a predicate. We can return some representation - // which we will check later if it is actually used, - // and return an error then. - ArrowDataType::Null - }; - Ok(value_type) + // NB: This branch is reached from `session::parse_expr` which requires + // a schema in order to parse the text into logical expression. + // However, we may not have the enumeration loaded, and without + // loading it we don't know the type (since the type is co-located + // in storage with the variants). + // We should not need to load all enumerations (potentially expensive) + // in order to parse text. + // We also should not error here because then nothing can be parsed + // if there are *any* enumerations in the schema. + // We can work around this by adding an intermediate step to analyze + // the SQL expression tree. + // We defer the implementation of this workaround, and other questions + // about enumeration evaluation, to CORE-285 + // + // For now we return a type which can only appear in this way, + // to return an error later. 
+ Ok(ArrowDataType::Null) } invalid => unreachable!( "Request for invalid schema type with discriminant {}", From d5caf2c65d2db6218669f4a121b3e69c3dec4464 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Thu, 3 Jul 2025 13:22:12 -0400 Subject: [PATCH 38/52] clippy --- tiledb/oxidize/arrow/src/enumeration.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tiledb/oxidize/arrow/src/enumeration.rs b/tiledb/oxidize/arrow/src/enumeration.rs index d169f948766..5ff15793a2b 100644 --- a/tiledb/oxidize/arrow/src/enumeration.rs +++ b/tiledb/oxidize/arrow/src/enumeration.rs @@ -15,6 +15,15 @@ pub enum Error { Variants(#[from] crate::record_batch::FieldError), } +/// Returns an [ArrowArray] whose elements are the variants of an [Enumeration]. +/// +/// # Safety +/// +/// When possible this function avoids copying data. This means that the +/// returned [ArrowArray] may reference data which lives inside the [Enumeration]. +/// It is not safe to use the value returned from this function after +/// the [Enumeration] is destructed. The caller must take care to abide this +/// requirement. Otherwise this function is safe to use. pub unsafe fn array_from_enumeration_ffi( enumeration: &Enumeration, ) -> Result, Error> { @@ -22,6 +31,15 @@ pub unsafe fn array_from_enumeration_ffi( Ok(Box::new(super::ArrowArray(a))) } +/// Returns an [ArrowArray] whose elements are the variants of an [Enumeration]. +/// +/// # Safety +/// +/// When possible this function avoids copying data. This means that the +/// returned [ArrowArray] may reference data which lives inside the [Enumeration]. +/// It is not safe to use the value returned from this function after +/// the [Enumeration] is destructed. The caller must take care to abide this +/// requirement. Otherwise this function is safe to use. 
pub unsafe fn array_from_enumeration( enumeration: &Enumeration, ) -> Result, Error> { From e18ccd704df2ddbaf8f60c28c58abbfed0f9b099 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Mon, 21 Jul 2025 16:10:42 -0400 Subject: [PATCH 39/52] Fix empty dimension tuple --- test/src/unit-query-add-predicate.cc | 9 ++++--- test/support/src/array_templates.h | 40 +++++++++++++++------------- 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/test/src/unit-query-add-predicate.cc b/test/src/unit-query-add-predicate.cc index 024b873865a..7466066a96e 100644 --- a/test/src/unit-query-add-predicate.cc +++ b/test/src/unit-query-add-predicate.cc @@ -289,14 +289,15 @@ void QueryAddPredicateFx::write_array_dense(const std::string& path) { s.add_range(1, 1, 4); query.set_layout(TILEDB_ROW_MAJOR).set_subarray(s); - templates::Fragment< + using DenseFragment = templates::Fragment< std::optional, std::vector, - std::optional> - cells = {.atts_ = INPUT.atts_}; + std::optional>; + + DenseFragment cells = {.atts_ = INPUT.atts_}; auto field_sizes = templates::query::make_field_sizes(cells); - templates::query::set_fields( + templates::query::set_fields( ctx_.ptr().get(), query.ptr().get(), field_sizes, diff --git a/test/support/src/array_templates.h b/test/support/src/array_templates.h index 639b4268035..49ff233b17e 100644 --- a/test/support/src/array_templates.h +++ b/test/support/src/array_templates.h @@ -1442,24 +1442,28 @@ void set_fields( std::decay_t, std::tuple_size_v>::value(field_cursors); - [&](std::tuple fields) { - query_applicator::set( - ctx, - query, - split_sizes.first, - fields, - dimension_name, - split_cursors.first); - }(fragment.dimensions()); - [&](std::tuple fields) { - query_applicator::set( - ctx, - query, - split_sizes.second, - fields, - attribute_name, - split_cursors.second); - }(fragment.attributes()); + if constexpr (!std::is_same_v>) { + [&](std::tuple fields) { + query_applicator::set( + ctx, + query, + split_sizes.first, + fields, + 
dimension_name, + split_cursors.first); + }(fragment.dimensions()); + } + if constexpr (!std::is_same_v>) { + [&](std::tuple fields) { + query_applicator::set( + ctx, + query, + split_sizes.second, + fields, + attribute_name, + split_cursors.second); + }(fragment.attributes()); + } } /** From f1d7eb5fafd5f200b49e5209a4b4775876746ce6 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Mon, 21 Jul 2025 12:16:33 -0400 Subject: [PATCH 40/52] make format --- test/support/src/array_templates.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/support/src/array_templates.h b/test/support/src/array_templates.h index 49ff233b17e..5350ffe5b20 100644 --- a/test/support/src/array_templates.h +++ b/test/support/src/array_templates.h @@ -1442,7 +1442,8 @@ void set_fields( std::decay_t, std::tuple_size_v>::value(field_cursors); - if constexpr (!std::is_same_v>) { + if constexpr (!std:: + is_same_v>) { [&](std::tuple fields) { query_applicator::set( ctx, @@ -1453,7 +1454,8 @@ void set_fields( split_cursors.first); }(fragment.dimensions()); } - if constexpr (!std::is_same_v>) { + if constexpr (!std:: + is_same_v>) { [&](std::tuple fields) { query_applicator::set( ctx, From 9046ef18aa855208677266e265affb9627f841ca Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Mon, 21 Jul 2025 12:43:10 -0400 Subject: [PATCH 41/52] RestClientFactory can construct in place --- tiledb/sm/rest/rest_client.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tiledb/sm/rest/rest_client.cc b/tiledb/sm/rest/rest_client.cc index a9339a15871..4dfc7b5b97e 100644 --- a/tiledb/sm/rest/rest_client.cc +++ b/tiledb/sm/rest/rest_client.cc @@ -104,7 +104,7 @@ std::shared_ptr RestClientFactory::make( Logger& logger, shared_ptr&& tracker) { if (factory_override_ == nullptr) { - return tdb::make_shared(HERE(), RestClient(config)); + return tdb::make_shared(HERE(), config); } else { return factory_override_( parent_stats, config, compute_tp, logger, std::move(tracker)); From 
33c8a72f6a21811c3bc622df5e7a33bddb6d88bc Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Mon, 21 Jul 2025 12:52:53 -0400 Subject: [PATCH 42/52] Handle TILEDB_RUST=OFF in unit-query-add-predicate.cc --- test/src/unit-query-add-predicate.cc | 65 ++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/test/src/unit-query-add-predicate.cc b/test/src/unit-query-add-predicate.cc index 7466066a96e..2630e47947a 100644 --- a/test/src/unit-query-add-predicate.cc +++ b/test/src/unit-query-add-predicate.cc @@ -48,6 +48,13 @@ #include "tiledb/sm/cpp_api/tiledb" #include "tiledb/sm/cpp_api/tiledb_experimental" +// this API only works if rust is enabled +#ifdef HAVE_RUST +static constexpr bool isAddPredicateEnabled = true; +#else +static constexpr bool isAddPredicateEnabled = false; +#endif + using namespace tiledb; using namespace tiledb::test; @@ -354,10 +361,36 @@ F QueryAddPredicateFx::query_array( return out; } +TEST_CASE_METHOD( + QueryAddPredicateFx, + "Query add predicate TILEDB_RUST=OFF", + "[capi][query][add_predicate]") { + if (isAddPredicateEnabled) { + SKIP("Test for build configuration TILEDB_RUST=OFF only"); + } + + const std::string array_name = + vfs_test_setup_.array_uri("test_query_add_predicate_TILEDB_RUST_OFF"); + + create_array(array_name, TILEDB_SPARSE); + write_array(array_name); + + const auto match = Catch::Matchers::ContainsSubstring( + "Cannot add query predicate: feature requires build " + "configuration '-DTILEDB_RUST=ON'"); + REQUIRE_THROWS_WITH( + query_array(array_name, TILEDB_GLOBAL_ORDER, {"a IS NULL", "row > col"}), + match); +} + TEST_CASE_METHOD( QueryAddPredicateFx, "Query add predicate errors", "[capi][query][add_predicate]") { + if (!isAddPredicateEnabled) { + SKIP("tiledb_query_add_predicate requires -DTILEDB_RUST=ON"); + } + const std::string array_name = vfs_test_setup_.array_uri("test_query_add_predicate_errors"); @@ -454,6 +487,10 @@ TEST_CASE_METHOD( QueryAddPredicateFx, "Query add predicate to in progress query", 
"[query][add_predicate]") { + if (!isAddPredicateEnabled) { + SKIP("tiledb_query_add_predicate requires -DTILEDB_RUST=ON"); + } + const std::string array_name = vfs_test_setup_.array_uri("test_query_add_predicate_in_progress"); @@ -492,6 +529,10 @@ TEST_CASE_METHOD( QueryAddPredicateFx, "Query add predicate dense array", "[query][add_predicate]") { + if (!isAddPredicateEnabled) { + SKIP("tiledb_query_add_predicate requires -DTILEDB_RUST=ON"); + } + const std::string array_name = vfs_test_setup_.array_uri("test_query_add_predicate_dense"); @@ -510,6 +551,10 @@ TEST_CASE_METHOD( QueryAddPredicateFx, "Query add predicate sparse unsupported query order", "[query][add_predicate]") { + if (!isAddPredicateEnabled) { + SKIP("tiledb_query_add_predicate requires -DTILEDB_RUST=ON"); + } + const std::string array_name = vfs_test_setup_.array_uri("test_query_add_predicate_sparse_unsupported"); @@ -553,6 +598,10 @@ TEST_CASE_METHOD( QueryAddPredicateFx, "Query add predicate sparse global order", "[query][add_predicate]") { + if (!isAddPredicateEnabled) { + SKIP("tiledb_query_add_predicate requires -DTILEDB_RUST=ON"); + } + const std::string array_name = vfs_test_setup_.array_uri("test_query_add_predicate_sparse_global_order"); @@ -678,6 +727,10 @@ TEST_CASE_METHOD( QueryAddPredicateFx, "Query add predicate sparse unordered with dups", "[query][add_predicate]") { + if (!isAddPredicateEnabled) { + SKIP("tiledb_query_add_predicate requires -DTILEDB_RUST=ON"); + } + const std::string array_name = vfs_test_setup_.array_uri( "test_query_add_predicate_sparse_unordered_with_dups"); @@ -798,6 +851,10 @@ TEST_CASE_METHOD( QueryAddPredicateFx, "Query add predicate evolved schema", "[query][add_predicate]") { + if (!isAddPredicateEnabled) { + SKIP("tiledb_query_add_predicate requires -DTILEDB_RUST=ON"); + } + const std::string array_name = vfs_test_setup_.array_uri("test_query_add_predicate_evolution"); @@ -859,6 +916,10 @@ TEST_CASE_METHOD( QueryAddPredicateFx, "Query add predicate with 
query condition", "[query][add_predicate]") { + if (!isAddPredicateEnabled) { + SKIP("tiledb_query_add_predicate requires -DTILEDB_RUST=ON"); + } + const auto query_order = TILEDB_GLOBAL_ORDER; const std::string array_name = vfs_test_setup_.array_uri( @@ -943,6 +1004,10 @@ TEST_CASE_METHOD( QueryAddPredicateFx, "Query add predicate field name escaping", "[query][add_predicate]") { + if (!isAddPredicateEnabled) { + SKIP("tiledb_query_add_predicate requires -DTILEDB_RUST=ON"); + } + const std::string array_name = vfs_test_setup_.array_uri("test_query_add_predicate_field_name_escape"); From a3e66171e6a3aa3672f7fe0f9d324cde1be18ffe Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Mon, 21 Jul 2025 12:59:21 -0400 Subject: [PATCH 43/52] HeapMemoryLinter ignores oxidize dir --- scripts/linter.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/linter.py b/scripts/linter.py index 8c695599e04..659e7661a3c 100755 --- a/scripts/linter.py +++ b/scripts/linter.py @@ -141,6 +141,12 @@ def accept_path(self, file_name: str) -> bool: path_components = file_name.split(os.sep) if 'test' in path_components or 'test-support' in path_components: return False + + # the Rust/C++ inter-op using Rust's `cxx` crate can only pass values from + # C++ to Rust using std::unique_ptr + if 'oxidize' in path_components: + return False + return path_components[-1] not in heap_memory_ignored_files From 572ba80ca2c9cd24952d8c79849d061b07d8369d Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Wed, 22 Oct 2025 11:40:46 -0400 Subject: [PATCH 44/52] Remove non-experimental version of add_predicate --- tiledb/sm/cpp_api/query.h | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/tiledb/sm/cpp_api/query.h b/tiledb/sm/cpp_api/query.h index 10d943929d9..2e5308180d2 100644 --- a/tiledb/sm/cpp_api/query.h +++ b/tiledb/sm/cpp_api/query.h @@ -249,23 +249,6 @@ class Query { return *this; } - /** - * Adds a predicate. 
The predicate will be analyzed and evaluated - * in the subarray step, query condition step, or both. - * - * The predicate is parsed as a SQL expression and must evaluate - * to a boolean. - * - * @param predicate a SQL representation of the predicate - * @return Reference to this Query - */ - Query& add_predicate(const std::string& predicate) { - auto& ctx = ctx_.get(); - ctx.handle_error(tiledb_query_add_predicate( - ctx.ptr().get(), query_.get(), predicate.c_str())); - return *this; - } - /** Returns the array of the query. */ const Array& array() { return array_; From 1823e8a5554935c49b832817ad7f7ae279648690 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Wed, 12 Nov 2025 12:52:35 -0500 Subject: [PATCH 45/52] Fix error message grammar --- test/src/unit-query-add-predicate.cc | 2 +- tiledb/sm/query/query.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/src/unit-query-add-predicate.cc b/test/src/unit-query-add-predicate.cc index 8b350978e99..e14d40e4c2a 100644 --- a/test/src/unit-query-add-predicate.cc +++ b/test/src/unit-query-add-predicate.cc @@ -480,7 +480,7 @@ TEST_CASE_METHOD( REQUIRE_THROWS_WITH( QueryExperimental::add_predicate(ctx_, query, {"sum(row) >= 10"}), Catch::Matchers::ContainsSubstring( - "Aggregate functions in predicate is not supported")); + "Aggregate functions in predicates are not supported")); } } } diff --git a/tiledb/sm/query/query.cc b/tiledb/sm/query/query.cc index 36b615bd904..20c59678e09 100644 --- a/tiledb/sm/query/query.cc +++ b/tiledb/sm/query/query.cc @@ -1563,7 +1563,7 @@ Status Query::add_predicate([[maybe_unused]] const char* predicate) { } if (expr->has_aggregate_functions()) { return Status_QueryError( - "Aggregate functions in predicate is not supported"); + "Aggregate functions in predicates are not supported"); } predicates_.push_back(std::move(expr)); } catch (const rust::Error& e) { From c242c167980f3e9667edb0854f572bf883a5bb53 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Wed, 12 Nov 
2025 13:11:37 -0500 Subject: [PATCH 46/52] Split no predicate and WHERE TRUE in examples --- examples/c_api/query_add_predicate.c | 22 ++++++++++++++-------- examples/cpp_api/query_add_predicate.cc | 22 ++++++++++++++-------- 2 files changed, 28 insertions(+), 16 deletions(-) diff --git a/examples/c_api/query_add_predicate.c b/examples/c_api/query_add_predicate.c index 7e6dbe63eba..554c2d1f876 100644 --- a/examples/c_api/query_add_predicate.c +++ b/examples/c_api/query_add_predicate.c @@ -441,33 +441,39 @@ int main() { // EXAMPLES FROM query_condition_sparse.c EXAMPLE - // Printing the entire array. - printf("WHERE TRUE\n"); + // Execute a read query with no predicate which prints the entire array. + printf("NO PREDICATE\n"); RETURN_IF_NOT_OK(read_array_with_predicates(ctx, 0)); printf("\n"); - // Execute a read query with query condition `a = null`. + // Execute a read query with predicate `TRUE`, which filters no cells and + // prints the whole array + printf("WHERE TRUE\n"); + RETURN_IF_NOT_OK(read_array_with_predicate(ctx, "TRUE")); + printf("\n"); + + // Execute a read query with predicate `a = null`. printf("WHERE a IS NULL\n"); RETURN_IF_NOT_OK(read_array_with_predicate(ctx, "a IS NULL")); printf("\n"); - // Execute a read query with query condition `b < "eve"`. + // Execute a read query with predicate `b < "eve"`. printf("WHERE b < 'eve'\n"); RETURN_IF_NOT_OK(read_array_with_predicate(ctx, "b < 'eve'")); printf("\n"); - // Execute a read query with query condition `c >= 1`. + // Execute a read query with predicate `c >= 1`. printf("WHERE c >= 1\n"); RETURN_IF_NOT_OK(read_array_with_predicate(ctx, "c >= 1")); printf("\n"); - // Execute a read query with query condition `3.0f <= d AND d <= 4.0f`. + // Execute a read query with predicate `3.0f <= d AND d <= 4.0f`. 
printf("WHERE d BETWEEN 3.0 AND 4.0\n"); RETURN_IF_NOT_OK(read_array_with_predicate(ctx, "d BETWEEN 3.0 AND 4.0")); printf("\n"); - // Execute a read query with query condition `3.0f <= d AND d <= 4.0f AND a != - // null AND b < \"eve\"`. + // Execute a read query with predicate `3.0f <= d AND d <= 4.0f AND a != null + // AND b < \"eve\"`. printf("WHERE (d BETWEEN 3.0 AND 4.0) AND a IS NOT NULL AND b < 'eve'\n"); RETURN_IF_NOT_OK(read_array_with_predicates( ctx, 3, "d BETWEEN 3.0 AND 4.0", "a IS NOT NULL", "b < 'eve'")); diff --git a/examples/cpp_api/query_add_predicate.cc b/examples/cpp_api/query_add_predicate.cc index 50eae669b7a..c62f18fd6b1 100644 --- a/examples/cpp_api/query_add_predicate.cc +++ b/examples/cpp_api/query_add_predicate.cc @@ -295,34 +295,40 @@ int main() { // EXAMPLES FROM query_condition_sparse.cc EXAMPLE - // Printing the entire array. - std::cout << "WHERE TRUE" << std::endl; + // Execute a read query with no predicate which prints the entire array. + std::cout << "NO PREDICATE" << std::endl; read_array_with_predicates(ctx, {}); std::cout << std::endl; - // Execute a read query with query condition `a = null`. + // Execute a read query with predicate `TRUE`, which filters no cells and + // prints the whole array + std::cout << "WHERE TRUE" << std::endl; + read_array_with_predicates(ctx, {"TRUE"}); + std::cout << std::endl; + + // Execute a read query with predicate `a = null`. std::cout << "WHERE a IS NULL" << std::endl; read_array_with_predicates(ctx, {"a IS NULL"}); std::cout << std::endl; - // Execute a read query with query condition `b < "eve"`. + // Execute a read query with predicate `b < "eve"`. std::cout << "WHERE b < 'eve'" << std::endl; read_array_with_predicates(ctx, {"b < 'eve'"}); std::cout << std::endl; - // Execute a read query with query condition `c >= 1`. + // Execute a read query with predicate `c >= 1`. 
std::cout << "WHERE c >= 1" << std::endl; read_array_with_predicates(ctx, {"c >= 1"}); std::cout << std::endl; - // Execute a read query with query condition `3.0f <= d AND d <= 4.0f`. + // Execute a read query with predicate `3.0f <= d AND d <= 4.0f`. std::cout << "WHERE d BETWEEN 3.0 AND 4.0" << std::endl; QueryCondition qc3(ctx); read_array_with_predicates(ctx, {"d BETWEEN 3.0 AND 4.0"}); std::cout << std::endl; - // Execute a read query with query condition `3.0f <= d AND d <= 4.0f AND a != - // null AND b < \"eve\"`. + // Execute a read query with predicate `3.0f <= d AND d <= 4.0f AND a != null + // AND b < \"eve\"`. std::cout << "WHERE d BETWEEN 3.0 AND 4.0 AND a IS NOT NULL AND b < 'eve'" << std::endl; read_array_with_predicates( From d8eb4197f4ac76a8985c2bf394fc44f0882dcb0c Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Wed, 12 Nov 2025 14:32:43 -0500 Subject: [PATCH 47/52] cargo update and remove cxxbridge version pin --- cmake/oxidize.cmake | 5 +- test/src/unit-query-add-predicate.cc | 15 +- tiledb/oxidize/Cargo.lock | 1213 ++++++++++---------- tiledb/oxidize/Cargo.toml | 8 +- tiledb/oxidize/expr/src/logical_expr.rs | 2 +- tiledb/oxidize/expr/src/physical_expr.rs | 2 +- tiledb/oxidize/expr/src/query_condition.rs | 27 +- 7 files changed, 639 insertions(+), 633 deletions(-) diff --git a/cmake/oxidize.cmake b/cmake/oxidize.cmake index b381d2c22f8..cb494875ce5 100644 --- a/cmake/oxidize.cmake +++ b/cmake/oxidize.cmake @@ -152,12 +152,9 @@ if (TILEDB_RUST) set(CARGO_INSTALL_ROOT ${CMAKE_BINARY_DIR}/cargo/install) set(CARGO_INSTALL_BIN ${CARGO_INSTALL_ROOT}/bin) - # pin version of cxxbridge due to https://github.com/dtolnay/cxx/issues/1436 build errors on MacOS - set(CXXBRIDGE_VERSION 1.0.138) - execute_process( COMMAND - ${CARGO} install cxxbridge-cmd --version ${CXXBRIDGE_VERSION} --root ${CARGO_INSTALL_ROOT} + ${CARGO} install cxxbridge-cmd --root ${CARGO_INSTALL_ROOT} ) execute_process( COMMAND diff --git a/test/src/unit-query-add-predicate.cc 
b/test/src/unit-query-add-predicate.cc index e14d40e4c2a..3fa1f178d7f 100644 --- a/test/src/unit-query-add-predicate.cc +++ b/test/src/unit-query-add-predicate.cc @@ -427,13 +427,12 @@ TEST_CASE_METHOD( } SECTION("Syntax error") { - // FIXME: this smells like a bug in datafusion. - // If you dbg! the returned expr it prints `Expr::Column(Column { name: - // "row" })` + const std::string expect = + "Parse error: SQL error: ParserError(\"Expected: end of expression, " + "found: col at Line: 1, Column: 5\")"; REQUIRE_THROWS_WITH( QueryExperimental::add_predicate(ctx_, query, {"row col"}), - Catch::Matchers::ContainsSubstring( - "Error: Expression does not return a boolean value")); + Catch::Matchers::ContainsSubstring(expect)); } SECTION("Non-expression") { @@ -467,9 +466,9 @@ TEST_CASE_METHOD( const std::string dferror = "Error adding predicate: Type coercion error: Internal error: Expect " "TypeSignatureClass::Native(LogicalType(Native(String), String)) but " - "received NativeType::UInt64, DataType: UInt64.\nThis was likely " - "caused by a bug in DataFusion's code and we would welcome that you " - "file an bug report in our issue tracker"; + "received NativeType::UInt64, DataType: UInt64.\nThis issue was " + "likely caused by a bug in DataFusion's code. Please help us to " + "resolve this by filing a bug report in our issue tracker:"; REQUIRE_THROWS_WITH( QueryExperimental::add_predicate( ctx_, query, {"starts_with(row, '1')"}), diff --git a/tiledb/oxidize/Cargo.lock b/tiledb/oxidize/Cargo.lock index dccd4013900..cc42a117176 100644 --- a/tiledb/oxidize/Cargo.lock +++ b/tiledb/oxidize/Cargo.lock @@ -2,15 +2,6 @@ # It is not intended for manual editing. 
version = 4 -[[package]] -name = "addr2line" -version = "0.24.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1" -dependencies = [ - "gimli", -] - [[package]] name = "adler2" version = "2.0.1" @@ -25,7 +16,7 @@ checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" dependencies = [ "cfg-if", "const-random", - "getrandom 0.3.3", + "getrandom 0.3.4", "once_cell", "version_check", "zerocopy", @@ -33,9 +24,9 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "1.1.3" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" dependencies = [ "memchr", ] @@ -61,12 +52,6 @@ version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" -[[package]] -name = "android-tzdata" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" - [[package]] name = "android_system_properties" version = "0.1.5" @@ -78,15 +63,24 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.11" +version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd" +checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" [[package]] name = "anyhow" -version = "1.0.98" +version = "1.0.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" + +[[package]] +name = "ar_archive_writer" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" +checksum = "f0c269894b6fe5e9d7ada0cf69b5bf847ff35bc25fc271f08e1d080fce80339a" +dependencies = [ + "object", +] [[package]] name = "arrayref" @@ -102,9 +96,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "55.1.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1bb018b6960c87fd9d025009820406f74e83281185a8bdcb44880d2aa5c9a87" +checksum = "6e833808ff2d94ed40d9379848a950d995043c7fb3e81a30b383f4c6033821cc" dependencies = [ "arrow-arith", "arrow-array", @@ -123,9 +117,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "55.1.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44de76b51473aa888ecd6ad93ceb262fb8d40d1f1154a4df2f069b3590aa7575" +checksum = "ad08897b81588f60ba983e3ca39bda2b179bdd84dced378e7df81a5313802ef8" dependencies = [ "arrow-array", "arrow-buffer", @@ -137,9 +131,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "55.1.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29ed77e22744475a9a53d00026cf8e166fe73cf42d89c4c4ae63607ee1cfcc3f" +checksum = "8548ca7c070d8db9ce7aa43f37393e4bfcf3f2d3681df278490772fd1673d08d" dependencies = [ "ahash", "arrow-buffer", @@ -148,15 +142,15 @@ dependencies = [ "chrono", "chrono-tz", "half", - "hashbrown 0.15.4", + "hashbrown 0.16.0", "num", ] [[package]] name = "arrow-buffer" -version = "55.1.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0391c96eb58bf7389171d1e103112d3fc3e5625ca6b372d606f2688f1ea4cce" +checksum = "e003216336f70446457e280807a73899dd822feaf02087d31febca1363e2fccc" dependencies = [ "bytes", "half", @@ -165,9 +159,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "55.1.0" +version = "56.2.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f39e1d774ece9292697fcbe06b5584401b26bd34be1bec25c33edae65c2420ff" +checksum = "919418a0681298d3a77d1a315f625916cb5678ad0d74b9c60108eb15fd083023" dependencies = [ "arrow-array", "arrow-buffer", @@ -186,9 +180,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "55.1.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9055c972a07bf12c2a827debfd34f88d3b93da1941d36e1d9fee85eebe38a12a" +checksum = "bfa9bf02705b5cf762b6f764c65f04ae9082c7cfc4e96e0c33548ee3f67012eb" dependencies = [ "arrow-array", "arrow-cast", @@ -196,15 +190,14 @@ dependencies = [ "chrono", "csv", "csv-core", - "lazy_static", "regex", ] [[package]] name = "arrow-data" -version = "55.1.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf75ac27a08c7f48b88e5c923f267e980f27070147ab74615ad85b5c5f90473d" +checksum = "a5c64fff1d142f833d78897a772f2e5b55b36cb3e6320376f0961ab0db7bd6d0" dependencies = [ "arrow-buffer", "arrow-schema", @@ -214,23 +207,25 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "55.1.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a222f0d93772bd058d1268f4c28ea421a603d66f7979479048c429292fac7b2e" +checksum = "1d3594dcddccc7f20fd069bc8e9828ce37220372680ff638c5e00dea427d88f5" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", + "arrow-select", "flatbuffers", "lz4_flex", + "zstd", ] [[package]] name = "arrow-json" -version = "55.1.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9085342bbca0f75e8cb70513c0807cc7351f1fbf5cb98192a67d5e3044acb033" +checksum = "88cf36502b64a127dc659e3b305f1d993a544eab0d48cce704424e62074dc04b" dependencies = [ "arrow-array", "arrow-buffer", @@ -250,9 +245,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "55.1.0" +version = "56.2.0" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab2f1065a5cad7b9efa9e22ce5747ce826aa3855766755d4904535123ef431e7" +checksum = "3c8f82583eb4f8d84d4ee55fd1cb306720cddead7596edce95b50ee418edf66f" dependencies = [ "arrow-array", "arrow-buffer", @@ -263,9 +258,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "55.1.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3703a0e3e92d23c3f756df73d2dc9476873f873a76ae63ef9d3de17fda83b2d8" +checksum = "9d07ba24522229d9085031df6b94605e0f4b26e099fb7cdeec37abd941a73753" dependencies = [ "arrow-array", "arrow-buffer", @@ -276,15 +271,19 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "55.1.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73a47aa0c771b5381de2b7f16998d351a6f4eb839f1e13d48353e17e873d969b" +checksum = "b3aa9e59c611ebc291c28582077ef25c97f1975383f1479b12f3b9ffee2ffabe" +dependencies = [ + "serde", + "serde_json", +] [[package]] name = "arrow-select" -version = "55.1.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24b7b85575702b23b85272b01bc1c25a01c9b9852305e5d0078c79ba25d995d4" +checksum = "8c41dbbd1e97bfcaee4fcb30e29105fb2c75e4d82ae4de70b792a5d3f66b2e7a" dependencies = [ "ahash", "arrow-array", @@ -296,9 +295,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "55.1.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9260fddf1cdf2799ace2b4c2fc0356a9789fa7551e0953e35435536fecefebbd" +checksum = "53f5183c150fbc619eede22b861ea7c0eebed8eaac0333eaa7f6da5205fd504d" dependencies = [ "arrow-array", "arrow-buffer", @@ -317,7 +316,7 @@ version = "0.4.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06575e6a9673580f52661c92107baabffbf41e2141373441cbcdc47cb733003c" dependencies = [ - "bzip2", + "bzip2 0.5.2", "flate2", 
"futures-core", "memchr", @@ -330,9 +329,9 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.88" +version = "0.1.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e539d3fca749fcee5236ab05e93a52867dd549cc157c8cb7f99595f3cedffdb5" +checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", @@ -354,21 +353,6 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" -[[package]] -name = "backtrace" -version = "0.3.75" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6806a6321ec58106fea15becdad98371e28d92ccbc7c8f1b3b6dd724fe8f1002" -dependencies = [ - "addr2line", - "cfg-if", - "libc", - "miniz_oxide", - "object", - "rustc-demangle", - "windows-targets 0.52.6", -] - [[package]] name = "base64" version = "0.22.1" @@ -377,9 +361,9 @@ checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" [[package]] name = "bigdecimal" -version = "0.4.8" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a22f228ab7a1b23027ccc6c350b72868017af7ea8356fbdf19f8d991c690013" +checksum = "560f42649de9fa436b73517378a147ec21f6c997a546581df4b4b31677828934" dependencies = [ "autocfg", "libm", @@ -405,9 +389,9 @@ checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" [[package]] name = "bitflags" -version = "2.9.1" +version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" +checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" [[package]] name = "blake2" @@ -442,9 +426,9 @@ dependencies = [ [[package]] name = "brotli" -version = "8.0.1" +version = "8.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"9991eea70ea4f293524138648e41ee89b0b2b12ddef3b255effa43c8056e0e0d" +checksum = "4bd8b9603c7aa97359dbd97ecf258968c95f3adddd6db2f7e7a5bef101c84560" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -463,9 +447,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.18.1" +version = "3.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "793db76d6187cd04dff33004d8e6c9cc4e05cd330500379d2394209271b4aeee" +checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" [[package]] name = "byteorder" @@ -488,6 +472,15 @@ dependencies = [ "bzip2-sys", ] +[[package]] +name = "bzip2" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3a53fac24f34a81bc9954b5d6cfce0c21e18ec6959f44f56e8e90e4bb7c346c" +dependencies = [ + "libbz2-rs-sys", +] + [[package]] name = "bzip2-sys" version = "0.1.13+1.0.8" @@ -500,10 +493,11 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.27" +version = "1.2.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d487aa071b5f64da6f19a3e848e3578944b726ee5a4854b82172f02aa876bfdc" +checksum = "35900b6c8d709fb1d854671ae27aeaa9eec2f8b01b364e1619a40da3e6fe2afe" dependencies = [ + "find-msvc-tools", "jobserver", "libc", "shlex", @@ -512,7 +506,7 @@ dependencies = [ [[package]] name = "cells" version = "0.1.0" -source = "git+https://github.com/TileDB-Inc/tiledb-rs.git?branch=main#e418936fff551dd608e2a1b5e3c557f4c8e5d29d" +source = "git+https://github.com/TileDB-Inc/tiledb-rs.git?branch=main#1dafdf310ed2e8f4e314a40dff9f3ff46a22c64d" dependencies = [ "paste", "proptest", @@ -524,17 +518,16 @@ dependencies = [ [[package]] name = "cfg-if" -version = "1.0.1" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9555578bc9e57714c812a1f84e4fc5b4d21fcb063490c624de019f7464c91268" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" [[package]] name = "chrono" 
-version = "0.4.41" +version = "0.4.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c469d952047f47f91b68d1cba3f10d63c11d73e4636f24f08daf0278abf01c4d" +checksum = "145052bdd345b87320e369255277e3fb5152762ad123a901ef5c262dd38fe8d2" dependencies = [ - "android-tzdata", "iana-time-zone", "num-traits", "windows-link", @@ -542,39 +535,28 @@ dependencies = [ [[package]] name = "chrono-tz" -version = "0.10.3" +version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "efdce149c370f133a071ca8ef6ea340b7b88748ab0810097a9e2976eaa34b4f3" +checksum = "a6139a8597ed92cf816dfb33f5dd6cf0bb93a6adc938f11039f371bc5bcd26c3" dependencies = [ "chrono", - "chrono-tz-build", "phf", ] -[[package]] -name = "chrono-tz-build" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f10f8c9340e31fc120ff885fcdb54a0b48e474bbd77cab557f0c30a3e569402" -dependencies = [ - "parse-zoneinfo", - "phf_codegen", -] - [[package]] name = "clap" -version = "4.5.40" +version = "4.5.51" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40b6887a1d8685cebccf115538db5c0efe625ccac9696ad45c409d96566e910f" +checksum = "4c26d721170e0295f191a69bd9a1f93efcdb0aff38684b61ab5750468972e5f5" dependencies = [ "clap_builder", ] [[package]] name = "clap_builder" -version = "4.5.40" +version = "4.5.51" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0c66c08ce9f0c698cbce5c0279d0bb6ac936d8674174fe48f736533b964f59e" +checksum = "75835f0c7bf681bfd05abe44e965760fea999a5286c6eb2d59883634fd02011a" dependencies = [ "anstyle", "clap_lex", @@ -583,39 +565,30 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" - -[[package]] -name = "codespan-reporting" -version = "0.11.1" +version = "0.7.6" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e" -dependencies = [ - "termcolor", - "unicode-width 0.1.14", -] +checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" [[package]] name = "codespan-reporting" -version = "0.12.0" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe6d2e5af09e8c8ad56c969f2157a3d4238cebc7c55f0a517728c38f7b200f81" +checksum = "af491d569909a7e4dee0ad7db7f5341fef5c614d5b8ec8cf765732aba3cff681" dependencies = [ "serde", "termcolor", - "unicode-width 0.2.1", + "unicode-width", ] [[package]] name = "comfy-table" -version = "7.1.4" +version = "7.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a65ebfec4fb190b6f90e944a817d60499ee0744e582530e2c9900a22e591d9a" +checksum = "e0d05af1e006a2407bedef5af410552494ce5be9090444dbbcb57258c1af3d56" dependencies = [ - "unicode-segmentation", - "unicode-width 0.2.1", + "strum", + "strum_macros", + "unicode-width", ] [[package]] @@ -661,9 +634,9 @@ dependencies = [ [[package]] name = "crc32fast" -version = "1.4.2" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" dependencies = [ "cfg-if", ] @@ -676,15 +649,15 @@ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "crunchy" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" [[package]] name = "crypto-common" -version = "0.1.6" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" dependencies = [ "generic-array", "typenum", @@ -692,47 +665,49 @@ dependencies = [ [[package]] name = "csv" -version = "1.3.1" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf" +checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938" dependencies = [ "csv-core", "itoa", "ryu", - "serde", + "serde_core", ] [[package]] name = "csv-core" -version = "0.1.12" +version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d02f3b0da4c6504f86e9cd789d8dbafab48c2321be74e9987593de5a894d93d" +checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782" dependencies = [ "memchr", ] [[package]] name = "cxx" -version = "1.0.138" +version = "1.0.188" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3956d60afa98653c5a57f60d7056edd513bfe0307ef6fb06f6167400c3884459" +checksum = "47ac4eaf7ebe29e92f1b091ceefec7710a53a6f6154b2460afda626c113b65b9" dependencies = [ "cc", + "cxx-build", "cxxbridge-cmd", "cxxbridge-flags", "cxxbridge-macro", - "foldhash", + "foldhash 0.2.0", "link-cplusplus", ] [[package]] name = "cxx-build" -version = "1.0.158" +version = "1.0.188" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36a8232661d66dcf713394726157d3cfe0a89bfc85f52d6e9f9bbc2306797fe7" +checksum = "2abd4c3021eefbac5149f994c117b426852bca3a0aad227698527bca6d4ea657" dependencies = [ "cc", - "codespan-reporting 0.12.0", + "codespan-reporting", + "indexmap", "proc-macro2", "quote", "scratch", @@ -741,12 +716,13 @@ dependencies = [ [[package]] name = "cxxbridge-cmd" -version = "1.0.138" +version = "1.0.188" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"0f01e92ab4ce9fd4d16e3bb11b158d98cbdcca803c1417aa43130a6526fbf208" +checksum = "6f12fbc5888b2311f23e52a601e11ad7790d8f0dbb903ec26e2513bf5373ed70" dependencies = [ "clap", - "codespan-reporting 0.11.1", + "codespan-reporting", + "indexmap", "proc-macro2", "quote", "syn", @@ -754,19 +730,19 @@ dependencies = [ [[package]] name = "cxxbridge-flags" -version = "1.0.138" +version = "1.0.188" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c41cbfab344869e70998b388923f7d1266588f56c8ca284abf259b1c1ffc695" +checksum = "83d3dd7870af06e283f3f8ce0418019c96171c9ce122cfb9c8879de3d84388fd" [[package]] name = "cxxbridge-macro" -version = "1.0.138" +version = "1.0.188" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88d82a2f759f0ad3eae43b96604efd42b1d4729a35a6f2dc7bdb797ae25d9284" +checksum = "a26f0d82da663316786791c3d0e9f9edc7d1ee1f04bdad3d2643086a69d6256c" dependencies = [ + "indexmap", "proc-macro2", "quote", - "rustversion", "syn", ] @@ -786,16 +762,16 @@ dependencies = [ [[package]] name = "datafusion" -version = "47.0.0" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffe060b978f74ab446be722adb8a274e052e005bf6dfd171caadc3abaad10080" +checksum = "2af15bb3c6ffa33011ef579f6b0bcbe7c26584688bd6c994f548e44df67f011a" dependencies = [ "arrow", "arrow-ipc", "arrow-schema", "async-trait", "bytes", - "bzip2", + "bzip2 0.6.1", "chrono", "datafusion-catalog", "datafusion-catalog-listing", @@ -813,9 +789,9 @@ dependencies = [ "datafusion-functions-nested", "datafusion-functions-table", "datafusion-functions-window", - "datafusion-macros", "datafusion-optimizer", "datafusion-physical-expr", + "datafusion-physical-expr-adapter", "datafusion-physical-expr-common", "datafusion-physical-optimizer", "datafusion-physical-plan", @@ -828,7 +804,7 @@ dependencies = [ "object_store", "parking_lot", "parquet", - "rand", + "rand 0.9.2", "regex", "sqlparser", "tempfile", @@ -841,9 +817,9 @@ 
dependencies = [ [[package]] name = "datafusion-catalog" -version = "47.0.0" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61fe34f401bd03724a1f96d12108144f8cd495a3cdda2bf5e091822fb80b7e66" +checksum = "187622262ad8f7d16d3be9202b4c1e0116f1c9aa387e5074245538b755261621" dependencies = [ "arrow", "async-trait", @@ -867,9 +843,9 @@ dependencies = [ [[package]] name = "datafusion-catalog-listing" -version = "47.0.0" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4411b8e3bce5e0fc7521e44f201def2e2d5d1b5f176fb56e8cdc9942c890f00" +checksum = "9657314f0a32efd0382b9a46fdeb2d233273ece64baa68a7c45f5a192daf0f83" dependencies = [ "arrow", "async-trait", @@ -890,14 +866,15 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "47.0.0" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0734015d81c8375eb5d4869b7f7ecccc2ee8d6cb81948ef737cd0e7b743bd69c" +checksum = "5a83760d9a13122d025fbdb1d5d5aaf93dd9ada5e90ea229add92aa30898b2d1" dependencies = [ "ahash", "arrow", "arrow-ipc", "base64", + "chrono", "half", "hashbrown 0.14.5", "indexmap", @@ -914,9 +891,9 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "47.0.0" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5167bb1d2ccbb87c6bc36c295274d7a0519b14afcfdaf401d53cbcaa4ef4968b" +checksum = "5b6234a6c7173fe5db1c6c35c01a12b2aa0f803a3007feee53483218817f8b1e" dependencies = [ "futures", "log", @@ -925,21 +902,22 @@ dependencies = [ [[package]] name = "datafusion-datasource" -version = "47.0.0" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04e602dcdf2f50c2abf297cc2203c73531e6f48b29516af7695d338cf2a778b1" +checksum = "7256c9cb27a78709dd42d0c80f0178494637209cac6e29d5c93edd09b6721b86" dependencies = [ "arrow", "async-compression", "async-trait", "bytes", 
- "bzip2", + "bzip2 0.6.1", "chrono", "datafusion-common", "datafusion-common-runtime", "datafusion-execution", "datafusion-expr", "datafusion-physical-expr", + "datafusion-physical-expr-adapter", "datafusion-physical-expr-common", "datafusion-physical-plan", "datafusion-session", @@ -950,7 +928,7 @@ dependencies = [ "log", "object_store", "parquet", - "rand", + "rand 0.9.2", "tempfile", "tokio", "tokio-util", @@ -961,9 +939,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-csv" -version = "47.0.0" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3bb2253952dc32296ed5b84077cb2e0257fea4be6373e1c376426e17ead4ef6" +checksum = "64533a90f78e1684bfb113d200b540f18f268134622d7c96bbebc91354d04825" dependencies = [ "arrow", "async-trait", @@ -986,9 +964,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-json" -version = "47.0.0" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b8c7f47a5d2fe03bfa521ec9bafdb8a5c82de8377f60967c3663f00c8790352" +checksum = "8d7ebeb12c77df0aacad26f21b0d033aeede423a64b2b352f53048a75bf1d6e6" dependencies = [ "arrow", "async-trait", @@ -1011,9 +989,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-parquet" -version = "47.0.0" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27d15868ea39ed2dc266728b554f6304acd473de2142281ecfa1294bb7415923" +checksum = "09e783c4c7d7faa1199af2df4761c68530634521b176a8d1331ddbc5a5c75133" dependencies = [ "arrow", "async-trait", @@ -1026,9 +1004,11 @@ dependencies = [ "datafusion-expr", "datafusion-functions-aggregate", "datafusion-physical-expr", + "datafusion-physical-expr-adapter", "datafusion-physical-expr-common", "datafusion-physical-optimizer", "datafusion-physical-plan", + "datafusion-pruning", "datafusion-session", "futures", "itertools", @@ -1036,23 +1016,24 @@ dependencies = [ "object_store", "parking_lot", "parquet", - 
"rand", + "rand 0.9.2", "tokio", ] [[package]] name = "datafusion-doc" -version = "47.0.0" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a91f8c2c5788ef32f48ff56c68e5b545527b744822a284373ac79bba1ba47292" +checksum = "99ee6b1d9a80d13f9deb2291f45c07044b8e62fb540dbde2453a18be17a36429" [[package]] name = "datafusion-execution" -version = "47.0.0" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06f004d100f49a3658c9da6fb0c3a9b760062d96cd4ad82ccc3b7b69a9fb2f84" +checksum = "a4cec0a57653bec7b933fb248d3ffa3fa3ab3bd33bd140dc917f714ac036f531" dependencies = [ "arrow", + "async-trait", "dashmap", "datafusion-common", "datafusion-expr", @@ -1060,18 +1041,19 @@ dependencies = [ "log", "object_store", "parking_lot", - "rand", + "rand 0.9.2", "tempfile", "url", ] [[package]] name = "datafusion-expr" -version = "47.0.0" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a4e4ce3802609be38eeb607ee72f6fe86c3091460de9dbfae9e18db423b3964" +checksum = "ef76910bdca909722586389156d0aa4da4020e1631994d50fadd8ad4b1aa05fe" dependencies = [ "arrow", + "async-trait", "chrono", "datafusion-common", "datafusion-doc", @@ -1088,9 +1070,9 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "47.0.0" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "422ac9cf3b22bbbae8cdf8ceb33039107fde1b5492693168f13bd566b1bcc839" +checksum = "6d155ccbda29591ca71a1344dd6bed26c65a4438072b400df9db59447f590bb6" dependencies = [ "arrow", "datafusion-common", @@ -1101,9 +1083,9 @@ dependencies = [ [[package]] name = "datafusion-functions" -version = "47.0.0" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ddf0a0a2db5d2918349c978d42d80926c6aa2459cd8a3c533a84ec4bb63479e" +checksum = "7de2782136bd6014670fd84fe3b0ca3b3e4106c96403c3ae05c0598577139977" 
dependencies = [ "arrow", "arrow-buffer", @@ -1121,7 +1103,7 @@ dependencies = [ "itertools", "log", "md-5", - "rand", + "rand 0.9.2", "regex", "sha2", "unicode-segmentation", @@ -1130,9 +1112,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "47.0.0" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "408a05dafdc70d05a38a29005b8b15e21b0238734dab1e98483fcb58038c5aba" +checksum = "07331fc13603a9da97b74fd8a273f4238222943dffdbbed1c4c6f862a30105bf" dependencies = [ "ahash", "arrow", @@ -1151,9 +1133,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" -version = "47.0.0" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "756d21da2dd6c9bef97af1504970ff56cbf35d03fbd4ffd62827f02f4d2279d4" +checksum = "b5951e572a8610b89968a09b5420515a121fbc305c0258651f318dc07c97ab17" dependencies = [ "ahash", "arrow", @@ -1164,9 +1146,9 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" -version = "47.0.0" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d8d50f6334b378930d992d801a10ac5b3e93b846b39e4a05085742572844537" +checksum = "fdacca9302c3d8fc03f3e94f338767e786a88a33f5ebad6ffc0e7b50364b9ea3" dependencies = [ "arrow", "arrow-ord", @@ -1176,6 +1158,7 @@ dependencies = [ "datafusion-expr", "datafusion-functions", "datafusion-functions-aggregate", + "datafusion-functions-aggregate-common", "datafusion-macros", "datafusion-physical-expr-common", "itertools", @@ -1185,9 +1168,9 @@ dependencies = [ [[package]] name = "datafusion-functions-table" -version = "47.0.0" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc9a97220736c8fff1446e936be90d57216c06f28969f9ffd3b72ac93c958c8a" +checksum = "8c37ff8a99434fbbad604a7e0669717c58c7c4f14c472d45067c4b016621d981" dependencies = [ "arrow", "async-trait", @@ -1201,10 +1184,11 @@ 
dependencies = [ [[package]] name = "datafusion-functions-window" -version = "47.0.0" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cefc2d77646e1aadd1d6a9c40088937aedec04e68c5f0465939912e1291f8193" +checksum = "48e2aea7c79c926cffabb13dc27309d4eaeb130f4a21c8ba91cdd241c813652b" dependencies = [ + "arrow", "datafusion-common", "datafusion-doc", "datafusion-expr", @@ -1218,9 +1202,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "47.0.0" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd4aff082c42fa6da99ce0698c85addd5252928c908eb087ca3cfa64ff16b313" +checksum = "0fead257ab5fd2ffc3b40fda64da307e20de0040fe43d49197241d9de82a487f" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -1228,9 +1212,9 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "47.0.0" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df6f88d7ee27daf8b108ba910f9015176b36fbc72902b1ca5c2a5f1d1717e1a1" +checksum = "ec6f637bce95efac05cdfb9b6c19579ed4aa5f6b94d951cfa5bb054b7bb4f730" dependencies = [ "datafusion-expr", "quote", @@ -1239,14 +1223,15 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "47.0.0" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "084d9f979c4b155346d3c34b18f4256e6904ded508e9554d90fed416415c3515" +checksum = "c6583ef666ae000a613a837e69e456681a9faa96347bf3877661e9e89e141d8a" dependencies = [ "arrow", "chrono", "datafusion-common", "datafusion-expr", + "datafusion-expr-common", "datafusion-physical-expr", "indexmap", "itertools", @@ -1258,9 +1243,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "47.0.0" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"64c536062b0076f4e30084065d805f389f9fe38af0ca75bcbac86bc5e9fbab65" +checksum = "c8668103361a272cbbe3a61f72eca60c9b7c706e87cc3565bcf21e2b277b84f6" dependencies = [ "ahash", "arrow", @@ -1274,15 +1259,31 @@ dependencies = [ "indexmap", "itertools", "log", + "parking_lot", "paste", "petgraph", ] +[[package]] +name = "datafusion-physical-expr-adapter" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "815acced725d30601b397e39958e0e55630e0a10d66ef7769c14ae6597298bb0" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-expr", + "datafusion-functions", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "itertools", +] + [[package]] name = "datafusion-physical-expr-common" -version = "47.0.0" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8a92b53b3193fac1916a1c5b8e3f4347c526f6822e56b71faa5fb372327a863" +checksum = "6652fe7b5bf87e85ed175f571745305565da2c0b599d98e697bcbedc7baa47c3" dependencies = [ "ahash", "arrow", @@ -1294,9 +1295,9 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" -version = "47.0.0" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fa0a5ac94c7cf3da97bedabd69d6bbca12aef84b9b37e6e9e8c25286511b5e2" +checksum = "49b7d623eb6162a3332b564a0907ba00895c505d101b99af78345f1acf929b5c" dependencies = [ "arrow", "datafusion-common", @@ -1306,6 +1307,7 @@ dependencies = [ "datafusion-physical-expr", "datafusion-physical-expr-common", "datafusion-physical-plan", + "datafusion-pruning", "itertools", "log", "recursive", @@ -1313,9 +1315,9 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "47.0.0" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "690c615db468c2e5fe5085b232d8b1c088299a6c63d87fd960a354a71f7acb55" +checksum = "e2f7f778a1a838dec124efb96eae6144237d546945587557c9e6936b3414558c" dependencies 
= [ "ahash", "arrow", @@ -1327,6 +1329,7 @@ dependencies = [ "datafusion-common-runtime", "datafusion-execution", "datafusion-expr", + "datafusion-functions-aggregate-common", "datafusion-functions-window-common", "datafusion-physical-expr", "datafusion-physical-expr-common", @@ -1341,11 +1344,29 @@ dependencies = [ "tokio", ] +[[package]] +name = "datafusion-pruning" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd1e59e2ca14fe3c30f141600b10ad8815e2856caa59ebbd0e3e07cd3d127a65" +dependencies = [ + "arrow", + "arrow-schema", + "datafusion-common", + "datafusion-datasource", + "datafusion-expr-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "itertools", + "log", +] + [[package]] name = "datafusion-session" -version = "47.0.0" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad229a134c7406c057ece00c8743c0c34b97f4e72f78b475fe17b66c5e14fa4f" +checksum = "21ef8e2745583619bd7a49474e8f45fbe98ebb31a133f27802217125a7b3d58d" dependencies = [ "arrow", "async-trait", @@ -1367,9 +1388,9 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "47.0.0" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64f6ab28b72b664c21a27b22a2ff815fd390ed224c26e89a93b5a8154a4e8607" +checksum = "89abd9868770386fede29e5a4b14f49c0bf48d652c3b9d7a8a0332329b87d50b" dependencies = [ "arrow", "bigdecimal", @@ -1418,12 +1439,12 @@ checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" [[package]] name = "errno" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "778e2ac28f6c47af28e4907f13ffd1e1ddbd400980a9abd7c8df189bf578a5ad" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -1432,6 
+1453,12 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" +[[package]] +name = "find-msvc-tools" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52051878f80a721bb68ebfbc930e07b65ba72f2da88968ea5c06fd6ca3d3a127" + [[package]] name = "fixedbitset" version = "0.5.7" @@ -1440,9 +1467,9 @@ checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" [[package]] name = "flatbuffers" -version = "25.2.10" +version = "25.9.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1045398c1bfd89168b5fd3f1fc11f6e70b34f6f66300c87d44d3de849463abf1" +checksum = "09b6620799e7340ebd9968d2e0708eb82cf1971e9a16821e2091b6d6e475eed5" dependencies = [ "bitflags", "rustc_version", @@ -1450,9 +1477,9 @@ dependencies = [ [[package]] name = "flate2" -version = "1.1.2" +version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a3d7db9596fecd151c5f638c0ee5d5bd487b6e0ea232e5dc96d5250f6f94b1d" +checksum = "bfe33edd8e85a12a67454e37f8c75e730830d83e313556ab9ebf9ee7fbeb3bfb" dependencies = [ "crc32fast", "libz-rs-sys", @@ -1477,11 +1504,17 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "form_urlencoded" -version = "1.2.1" +version = "1.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" +checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf" dependencies = [ "percent-encoding", ] @@ -1592,43 +1625,40 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" dependencies = [ "cfg-if", + "js-sys", "libc", - "wasi 0.11.1+wasi-snapshot-preview1", + "wasi", + "wasm-bindgen", ] [[package]] name = "getrandom" -version = "0.3.3" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", "libc", "r-efi", - "wasi 0.14.2+wasi-0.2.4", + "wasip2", ] -[[package]] -name = "gimli" -version = "0.31.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" - [[package]] name = "glob" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" +checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" [[package]] name = "half" -version = "2.6.0" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" dependencies = [ "cfg-if", "crunchy", "num-traits", + "zerocopy", ] [[package]] @@ -1643,9 +1673,24 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.15.4" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "foldhash 0.1.5", +] + +[[package]] +name = "hashbrown" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d" + +[[package]] +name = 
"heck" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5971ac85611da7067dbfcabef3c70ebb5606018acd9e2a3903a0da507521e0d5" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] name = "hex" @@ -1666,15 +1711,15 @@ dependencies = [ [[package]] name = "humantime" -version = "2.2.0" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b112acc8b3adf4b107a8ec20977da0273a8c386765a3ec0229bd500a1443f9f" +checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424" [[package]] name = "iana-time-zone" -version = "0.1.63" +version = "0.1.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0c919e5debc312ad217002b8048a17b7d83f80703865bbfcfebb0458b0b27d8" +checksum = "33e57f83510bb73707521ebaffa789ec8caf86f9657cad665b092b581d40e9fb" dependencies = [ "android_system_properties", "core-foundation-sys", @@ -1696,9 +1741,9 @@ dependencies = [ [[package]] name = "icu_collections" -version = "2.0.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "200072f5d0e3614556f94a9930d5dc3e0662a652823904c3a75dc3b0af7fee47" +checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43" dependencies = [ "displaydoc", "potential_utf", @@ -1709,9 +1754,9 @@ dependencies = [ [[package]] name = "icu_locale_core" -version = "2.0.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0cde2700ccaed3872079a65fb1a78f6c0a36c91570f28755dda67bc8f7d9f00a" +checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6" dependencies = [ "displaydoc", "litemap", @@ -1722,11 +1767,10 @@ dependencies = [ [[package]] name = "icu_normalizer" -version = "2.0.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "436880e8e18df4d7bbc06d58432329d6458cc84531f7ac5f024e93deadb37979" 
+checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599" dependencies = [ - "displaydoc", "icu_collections", "icu_normalizer_data", "icu_properties", @@ -1737,42 +1781,38 @@ dependencies = [ [[package]] name = "icu_normalizer_data" -version = "2.0.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00210d6893afc98edb752b664b8890f0ef174c8adbb8d0be9710fa66fbbf72d3" +checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a" [[package]] name = "icu_properties" -version = "2.0.1" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "016c619c1eeb94efb86809b015c58f479963de65bdb6253345c1a1276f22e32b" +checksum = "e93fcd3157766c0c8da2f8cff6ce651a31f0810eaa1c51ec363ef790bbb5fb99" dependencies = [ - "displaydoc", "icu_collections", "icu_locale_core", "icu_properties_data", "icu_provider", - "potential_utf", "zerotrie", "zerovec", ] [[package]] name = "icu_properties_data" -version = "2.0.1" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "298459143998310acd25ffe6810ed544932242d3f07083eee1084d83a71bd632" +checksum = "02845b3647bb045f1100ecd6480ff52f34c35f82d9880e029d329c21d1054899" [[package]] name = "icu_provider" -version = "2.0.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03c80da27b5f4187909049ee2d72f276f0d9f99a42c306bd0131ecfe04d8e5af" +checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614" dependencies = [ "displaydoc", "icu_locale_core", - "stable_deref_trait", - "tinystr", "writeable", "yoke", "zerofrom", @@ -1782,9 +1822,9 @@ dependencies = [ [[package]] name = "idna" -version = "1.0.3" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" +checksum = 
"3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de" dependencies = [ "idna_adapter", "smallvec", @@ -1803,12 +1843,12 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.9.0" +version = "2.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e" +checksum = "6717a8d2a5a929a1a2eb43a12812498ed141a0bcfb7e8f7844fbdbe4303bba9f" dependencies = [ "equivalent", - "hashbrown 0.15.4", + "hashbrown 0.16.0", ] [[package]] @@ -1838,19 +1878,19 @@ checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" [[package]] name = "jobserver" -version = "0.1.33" +version = "0.1.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38f262f097c174adebe41eb73d66ae9c06b2844fb0da69969647bbddd9b0538a" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" dependencies = [ - "getrandom 0.3.3", + "getrandom 0.3.4", "libc", ] [[package]] name = "js-sys" -version = "0.3.77" +version = "0.3.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" +checksum = "b011eec8cc36da2aab2d5cff675ec18454fad408585853910a202391cf9f8e65" dependencies = [ "once_cell", "wasm-bindgen", @@ -1864,9 +1904,9 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "lexical-core" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b765c31809609075565a70b4b71402281283aeda7ecaf4818ac14a7b2ade8958" +checksum = "7d8d125a277f807e55a77304455eb7b1cb52f2b18c143b60e766c120bd64a594" dependencies = [ "lexical-parse-float", "lexical-parse-integer", @@ -1877,60 +1917,59 @@ dependencies = [ [[package]] name = "lexical-parse-float" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"de6f9cb01fb0b08060209a057c048fcbab8717b4c1ecd2eac66ebfe39a65b0f2" +checksum = "52a9f232fbd6f550bc0137dcb5f99ab674071ac2d690ac69704593cb4abbea56" dependencies = [ "lexical-parse-integer", "lexical-util", - "static_assertions", ] [[package]] name = "lexical-parse-integer" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72207aae22fc0a121ba7b6d479e42cbfea549af1479c3f3a4f12c70dd66df12e" +checksum = "9a7a039f8fb9c19c996cd7b2fcce303c1b2874fe1aca544edc85c4a5f8489b34" dependencies = [ "lexical-util", - "static_assertions", ] [[package]] name = "lexical-util" -version = "1.0.6" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a82e24bf537fd24c177ffbbdc6ebcc8d54732c35b50a3f28cc3f4e4c949a0b3" -dependencies = [ - "static_assertions", -] +checksum = "2604dd126bb14f13fb5d1bd6a66155079cb9fa655b37f875b3a742c705dbed17" [[package]] name = "lexical-write-float" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5afc668a27f460fb45a81a757b6bf2f43c2d7e30cb5a2dcd3abf294c78d62bd" +checksum = "50c438c87c013188d415fbabbb1dceb44249ab81664efbd31b14ae55dabb6361" dependencies = [ "lexical-util", "lexical-write-integer", - "static_assertions", ] [[package]] name = "lexical-write-integer" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "629ddff1a914a836fb245616a7888b62903aae58fa771e1d83943035efa0f978" +checksum = "409851a618475d2d5796377cad353802345cba92c867d9fbcde9cf4eac4e14df" dependencies = [ "lexical-util", - "static_assertions", ] +[[package]] +name = "libbz2-rs-sys" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c4a545a15244c7d945065b5d392b2d2d7f21526fba56ce51467b06ed445e8f7" + [[package]] name = "libc" -version = "0.2.174" +version = "0.2.177" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "1171693293099992e19cddea4e8b849964e9846f4acee11b3948bcc337be8776" +checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" [[package]] name = "libm" @@ -1940,49 +1979,48 @@ checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de" [[package]] name = "libz-rs-sys" -version = "0.5.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "172a788537a2221661b480fee8dc5f96c580eb34fa88764d3205dc356c7e4221" +checksum = "840db8cf39d9ec4dd794376f38acc40d0fc65eec2a8f484f7fd375b84602becd" dependencies = [ "zlib-rs", ] [[package]] name = "link-cplusplus" -version = "1.0.10" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a6f6da007f968f9def0d65a05b187e2960183de70c160204ecfccf0ee330212" +checksum = "7f78c730aaa7d0b9336a299029ea49f9ee53b0ed06e9202e8cb7db9bae7b8c82" dependencies = [ "cc", ] [[package]] name = "linux-raw-sys" -version = "0.9.4" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" +checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" [[package]] name = "litemap" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956" +checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" [[package]] name = "lock_api" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96936507f153605bddfcda068dd804796c84324ed2510809e5b2a624c81da765" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" dependencies = [ - "autocfg", "scopeguard", ] [[package]] name = "log" -version = "0.4.27" +version = "0.4.28" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" +checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" [[package]] name = "lz4_flex" @@ -2016,9 +2054,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.7.5" +version = "2.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" +checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" [[package]] name = "miniz_oxide" @@ -2027,6 +2065,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" dependencies = [ "adler2", + "simd-adler32", ] [[package]] @@ -2105,18 +2144,18 @@ dependencies = [ [[package]] name = "object" -version = "0.36.7" +version = "0.32.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87" +checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" dependencies = [ "memchr", ] [[package]] name = "object_store" -version = "0.12.2" +version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7781f96d79ed0f961a7021424ab01840efbda64ae7a505aaea195efc91eaaec4" +checksum = "4c1be0c6c22ec0817cdc77d3842f721a17fd30ab6965001415b5402a74e6b740" dependencies = [ "async-trait", "bytes", @@ -2127,7 +2166,7 @@ dependencies = [ "itertools", "parking_lot", "percent-encoding", - "thiserror 2.0.12", + "thiserror 2.0.17", "tokio", "tracing", "url", @@ -2153,9 +2192,9 @@ dependencies = [ [[package]] name = "parking_lot" -version = "0.12.4" +version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70d58bf43669b5795d1576d0641cfb6fbb2057bf629506267a92807158584a13" +checksum = 
"93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" dependencies = [ "lock_api", "parking_lot_core", @@ -2163,22 +2202,22 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.11" +version = "0.9.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc838d2a56b5b1a6c25f55575dfc605fabb63bb2365f6c2353ef9159aa69e4a5" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" dependencies = [ "cfg-if", "libc", "redox_syscall", "smallvec", - "windows-targets 0.52.6", + "windows-link", ] [[package]] name = "parquet" -version = "55.1.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be7b2d778f6b841d37083ebdf32e33a524acde1266b5884a8ca29bf00dfa1231" +checksum = "f0dbd48ad52d7dccf8ea1b90a3ddbfaea4f69878dd7683e51c507d4bc52b5b27" dependencies = [ "ahash", "arrow-array", @@ -2195,12 +2234,13 @@ dependencies = [ "flate2", "futures", "half", - "hashbrown 0.15.4", + "hashbrown 0.16.0", "lz4_flex", "num", "num-bigint", "object_store", "paste", + "ring", "seq-macro", "simdutf8", "snap", @@ -2210,15 +2250,6 @@ dependencies = [ "zstd", ] -[[package]] -name = "parse-zoneinfo" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f2a05b18d44e2957b88f96ba460715e295bc1d7510468a2f3d3b44535d26c24" -dependencies = [ - "regex", -] - [[package]] name = "paste" version = "1.0.15" @@ -2227,54 +2258,36 @@ checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" [[package]] name = "percent-encoding" -version = "2.3.1" +version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" +checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" [[package]] name = "petgraph" -version = "0.7.1" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" +checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" dependencies = [ "fixedbitset", + "hashbrown 0.15.5", "indexmap", + "serde", ] [[package]] name = "phf" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" -dependencies = [ - "phf_shared", -] - -[[package]] -name = "phf_codegen" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" -dependencies = [ - "phf_generator", - "phf_shared", -] - -[[package]] -name = "phf_generator" -version = "0.11.3" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" +checksum = "913273894cec178f401a31ec4b656318d95473527be05c0752cc41cdc32be8b7" dependencies = [ "phf_shared", - "rand", ] [[package]] name = "phf_shared" -version = "0.11.3" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" +checksum = "06005508882fb681fd97892ecff4b7fd0fee13ef1aa569f8695dae7ab9099981" dependencies = [ "siphasher", ] @@ -2299,9 +2312,9 @@ checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" [[package]] name = "potential_utf" -version = "0.1.2" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5a7c30837279ca13e7c867e9e40053bc68740f988cb07f7ca6df43cc734b585" +checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77" dependencies = [ "zerovec", ] @@ -2317,9 +2330,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.95" +version = "1.0.103" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" +checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8" dependencies = [ "unicode-ident", ] @@ -2335,8 +2348,8 @@ dependencies = [ "bitflags", "lazy_static", "num-traits", - "rand", - "rand_chacha", + "rand 0.8.5", + "rand_chacha 0.3.1", "rand_xorshift", "regex-syntax", "rusty-fork", @@ -2346,10 +2359,11 @@ dependencies = [ [[package]] name = "psm" -version = "0.1.26" +version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e944464ec8536cd1beb0bbfd96987eb5e3b72f2ecdafdc5c769a37f1fa2ae1f" +checksum = "d11f2fedc3b7dafdc2851bc52f277377c5473d378859be234bc7ebb593144d01" dependencies = [ + "ar_archive_writer", "cc", ] @@ -2361,9 +2375,9 @@ checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" [[package]] name = "quote" -version = "1.0.40" +version = "1.0.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f" dependencies = [ "proc-macro2", ] @@ -2381,8 +2395,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", - "rand_chacha", - "rand_core", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + +[[package]] +name = "rand" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +dependencies = [ + "rand_chacha 0.9.0", + "rand_core 0.9.3", ] [[package]] @@ -2392,7 +2416,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_chacha" +version = 
"0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core 0.9.3", ] [[package]] @@ -2404,13 +2438,22 @@ dependencies = [ "getrandom 0.2.16", ] +[[package]] +name = "rand_core" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" +dependencies = [ + "getrandom 0.3.4", +] + [[package]] name = "rand_xorshift" version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d25bf25ec5ae4a3f1b92f929810509a2f53d7dca2f50b794ff57e3face536c8f" dependencies = [ - "rand_core", + "rand_core 0.6.4", ] [[package]] @@ -2435,18 +2478,18 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.5.13" +version = "0.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d04b7d0ee6b4a0207a0a7adb104d23ecb0b47d6beae7152d0fa34b692b29fd6" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" dependencies = [ "bitflags", ] [[package]] name = "regex" -version = "1.11.1" +version = "1.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" dependencies = [ "aho-corasick", "memchr", @@ -2456,9 +2499,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.9" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" dependencies = [ "aho-corasick", "memchr", @@ -2467,15 +2510,23 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.8.5" +version = "0.8.8" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" +checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" [[package]] -name = "rustc-demangle" -version = "0.1.25" +name = "ring" +version = "0.17.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "989e6739f80c4ad5b13e0fd7fe89531180375b18520cc8c82080e4dc4035b84f" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" +dependencies = [ + "cc", + "cfg-if", + "getrandom 0.2.16", + "libc", + "untrusted", + "windows-sys 0.52.0", +] [[package]] name = "rustc_version" @@ -2488,28 +2539,28 @@ dependencies = [ [[package]] name = "rustix" -version = "1.0.7" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c71e83d6afe7ff64890ec6b71d6a69bb8a610ab78ce364b3352876bb4c801266" +checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e" dependencies = [ "bitflags", "errno", "libc", "linux-raw-sys", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] name = "rustversion" -version = "1.0.21" +version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a0d197bd2c9dc6e53b84da9556a69ba4cdfab8619eb41a8bd1cc2027a0f6b1d" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" [[package]] name = "rusty-fork" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb3dcc6e454c328bb824492db107ab7c0ae8fcffe4ad210136ef014458c1bc4f" +checksum = "cc6bf79ff24e648f6da1f8d1f011e9cac26491b619e6b9280f2b47f1774e6ee2" dependencies = [ "fnv", "quick-error", @@ -2540,15 +2591,15 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "scratch" -version = "1.0.8" +version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"9f6280af86e5f559536da57a45ebc84948833b3bee313a7dd25232e09c878a52" +checksum = "d68f2ec51b097e4c1a75b681a8bec621909b5e91f15bb7b840c4f2f7b01148b2" [[package]] name = "semver" -version = "1.0.26" +version = "1.0.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0" +checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" [[package]] name = "seq-macro" @@ -2558,18 +2609,28 @@ checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" [[package]] name = "serde" -version = "1.0.219" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.219" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", @@ -2578,14 +2639,15 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.140" +version = "1.0.145" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" +checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" dependencies = [ "itoa", "memchr", "ryu", "serde", + "serde_core", ] [[package]] @@ -2605,6 +2667,12 @@ version = "1.3.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +[[package]] +name = "simd-adler32" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" + [[package]] name = "simdutf8" version = "0.1.5" @@ -2637,9 +2705,9 @@ checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" [[package]] name = "sqlparser" -version = "0.55.0" +version = "0.58.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4521174166bac1ff04fe16ef4524c70144cd29682a45978978ca3d7f4e0be11" +checksum = "ec4b661c54b1e4b603b37873a18c59920e4c51ea8ea2cf527d925424dbd4437c" dependencies = [ "log", "recursive", @@ -2659,15 +2727,15 @@ dependencies = [ [[package]] name = "stable_deref_trait" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" [[package]] name = "stacker" -version = "0.1.21" +version = "0.1.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cddb07e32ddb770749da91081d8d0ac3a16f1a569a18b20348cd371f5dead06b" +checksum = "e1f8b29fb42aafcea4edeeb6b2f2d7ecd0d969c48b4cf0d2e64aafc471dd6e59" dependencies = [ "cc", "cfg-if", @@ -2676,16 +2744,10 @@ dependencies = [ "windows-sys 0.59.0", ] -[[package]] -name = "static_assertions" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" - [[package]] name = "strategy-ext" version = "0.1.0" -source = "git+https://github.com/TileDB-Inc/tiledb-rs.git?branch=main#e418936fff551dd608e2a1b5e3c557f4c8e5d29d" +source = 
"git+https://github.com/TileDB-Inc/tiledb-rs.git?branch=main#1dafdf310ed2e8f4e314a40dff9f3ff46a22c64d" dependencies = [ "num-traits", "proptest", @@ -2697,6 +2759,25 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +[[package]] +name = "strum" +version = "0.26.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" + +[[package]] +name = "strum_macros" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "rustversion", + "syn", +] + [[package]] name = "subtle" version = "2.6.1" @@ -2705,9 +2786,9 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" -version = "2.0.104" +version = "2.0.110" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17b6f705963418cdb9927482fa304bc562ece2fdd4f616084c50b7023b435a40" +checksum = "a99801b5bd34ede4cf3fc688c5919368fea4e4814a4664359503e6015b280aea" dependencies = [ "proc-macro2", "quote", @@ -2727,15 +2808,15 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.20.0" +version = "3.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8a64e3985349f2441a1a9ef0b853f869006c3855f2cda6862a94d26ebb9d6a1" +checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16" dependencies = [ "fastrand", - "getrandom 0.3.3", + "getrandom 0.3.4", "once_cell", "rustix", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -2758,11 +2839,11 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.12" +version = "2.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708" +checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" dependencies = [ - "thiserror-impl 2.0.12", + "thiserror-impl 2.0.17", ] [[package]] @@ -2778,9 +2859,9 @@ dependencies = [ [[package]] name = "thiserror-impl" -version = "2.0.12" +version = "2.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d" +checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" dependencies = [ "proc-macro2", "quote", @@ -2818,14 +2899,14 @@ dependencies = [ "iterator-ext", "itertools", "num-traits", - "thiserror 2.0.12", + "thiserror 2.0.17", "tiledb-cxx-interface", ] [[package]] name = "tiledb-common" version = "0.1.0" -source = "git+https://github.com/TileDB-Inc/tiledb-rs.git?branch=main#e418936fff551dd608e2a1b5e3c557f4c8e5d29d" +source = "git+https://github.com/TileDB-Inc/tiledb-rs.git?branch=main#1dafdf310ed2e8f4e314a40dff9f3ff46a22c64d" dependencies = [ "anyhow", "num-traits", @@ -2872,7 +2953,7 @@ dependencies = [ "datafusion", "itertools", "num-traits", - "thiserror 2.0.12", + "thiserror 2.0.17", "tiledb-arrow", "tiledb-cxx-interface", "tiledb-datatype", @@ -2881,7 +2962,7 @@ dependencies = [ [[package]] name = "tiledb-pod" version = "0.1.0" -source = "git+https://github.com/TileDB-Inc/tiledb-rs.git?branch=main#e418936fff551dd608e2a1b5e3c557f4c8e5d29d" +source = "git+https://github.com/TileDB-Inc/tiledb-rs.git?branch=main#1dafdf310ed2e8f4e314a40dff9f3ff46a22c64d" dependencies = [ "itertools", "num-traits", @@ -2897,7 +2978,7 @@ dependencies = [ [[package]] name = "tiledb-proc-macro" version = "0.1.0" -source = "git+https://github.com/TileDB-Inc/tiledb-rs.git?branch=main#e418936fff551dd608e2a1b5e3c557f4c8e5d29d" +source = "git+https://github.com/TileDB-Inc/tiledb-rs.git?branch=main#1dafdf310ed2e8f4e314a40dff9f3ff46a22c64d" dependencies = [ "proc-macro2", "quote", @@ -2907,7 
+2988,7 @@ dependencies = [ [[package]] name = "tiledb-proptest-config" version = "0.1.0" -source = "git+https://github.com/TileDB-Inc/tiledb-rs.git?branch=main#e418936fff551dd608e2a1b5e3c557f4c8e5d29d" +source = "git+https://github.com/TileDB-Inc/tiledb-rs.git?branch=main#1dafdf310ed2e8f4e314a40dff9f3ff46a22c64d" [[package]] name = "tiledb-session" @@ -2919,7 +3000,7 @@ dependencies = [ "datafusion", "itertools", "num-traits", - "thiserror 2.0.12", + "thiserror 2.0.17", "tiledb-arrow", "tiledb-cxx-interface", "tiledb-expr", @@ -2928,7 +3009,7 @@ dependencies = [ [[package]] name = "tiledb-sys-defs" version = "0.1.0" -source = "git+https://github.com/TileDB-Inc/tiledb-rs.git?branch=main#e418936fff551dd608e2a1b5e3c557f4c8e5d29d" +source = "git+https://github.com/TileDB-Inc/tiledb-rs.git?branch=main#1dafdf310ed2e8f4e314a40dff9f3ff46a22c64d" [[package]] name = "tiledb-test-array-schema" @@ -3014,7 +3095,7 @@ dependencies = [ [[package]] name = "tiledb-utils" version = "0.1.0" -source = "git+https://github.com/TileDB-Inc/tiledb-rs.git?branch=main#e418936fff551dd608e2a1b5e3c557f4c8e5d29d" +source = "git+https://github.com/TileDB-Inc/tiledb-rs.git?branch=main#1dafdf310ed2e8f4e314a40dff9f3ff46a22c64d" dependencies = [ "float_next_after", "num-traits", @@ -3032,9 +3113,9 @@ dependencies = [ [[package]] name = "tinystr" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d4f6d1145dcb577acf783d4e601bc1d76a13337bb54e6233add580b07344c8b" +checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869" dependencies = [ "displaydoc", "zerovec", @@ -3042,11 +3123,10 @@ dependencies = [ [[package]] name = "tokio" -version = "1.45.1" +version = "1.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75ef51a33ef1da925cea3e4eb122833cb377c61439ca401b770f54902b806779" +checksum = "ff360e02eab121e0bc37a2d3b4d4dc622e6eda3a8e5253d5435ecf5bd4c68408" dependencies = [ - "backtrace", 
"bytes", "pin-project-lite", "tokio-macros", @@ -3054,9 +3134,9 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.5.0" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" +checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" dependencies = [ "proc-macro2", "quote", @@ -3065,9 +3145,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.15" +version = "0.7.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66a539a9ad6d5d281510d5bd368c973d636c02dbf8a67300bfb6b950696ad7df" +checksum = "2efa149fe76073d6e8fd97ef4f4eca7b67f599660115591483572e406e165594" dependencies = [ "bytes", "futures-core", @@ -3109,15 +3189,15 @@ dependencies = [ [[package]] name = "twox-hash" -version = "2.1.1" +version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b907da542cbced5261bd3256de1b3a1bf340a3d37f93425a07362a1d687de56" +checksum = "9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c" [[package]] name = "typenum" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" +checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" [[package]] name = "unarray" @@ -3127,9 +3207,9 @@ checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" [[package]] name = "unicode-ident" -version = "1.0.18" +version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" +checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" [[package]] name = "unicode-segmentation" @@ -3139,25 +3219,26 @@ checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" 
[[package]] name = "unicode-width" -version = "0.1.14" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" [[package]] -name = "unicode-width" -version = "0.2.1" +name = "untrusted" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a1a07cc7db3810833284e8d372ccdc6da29741639ecc70c9ec107df0fa6154c" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "url" -version = "2.5.4" +version = "2.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" +checksum = "08bc136a29a3d1758e07a9cca267be308aeebf5cfd5a10f3f67ab2097683ef5b" dependencies = [ "form_urlencoded", "idna", "percent-encoding", + "serde", ] [[package]] @@ -3168,11 +3249,11 @@ checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" [[package]] name = "uuid" -version = "1.17.0" +version = "1.18.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cf4199d1e5d15ddd86a694e4d0dffa9c323ce759fea589f00fef9d81cc1931d" +checksum = "2f87b8aa10b915a06587d0dec516c282ff295b475d94abf425d62b57710070a2" dependencies = [ - "getrandom 0.3.3", + "getrandom 0.3.4", "js-sys", "wasm-bindgen", ] @@ -3209,45 +3290,32 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" [[package]] -name = "wasi" -version = "0.14.2+wasi-0.2.4" +name = "wasip2" +version = "1.0.1+wasi-0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3" +checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" dependencies = [ - "wit-bindgen-rt", 
+ "wit-bindgen", ] [[package]] name = "wasm-bindgen" -version = "0.2.100" +version = "0.2.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" +checksum = "da95793dfc411fbbd93f5be7715b0578ec61fe87cb1a42b12eb625caa5c5ea60" dependencies = [ "cfg-if", "once_cell", "rustversion", "wasm-bindgen-macro", -] - -[[package]] -name = "wasm-bindgen-backend" -version = "0.2.100" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" -dependencies = [ - "bumpalo", - "log", - "proc-macro2", - "quote", - "syn", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.50" +version = "0.4.55" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "555d470ec0bc3bb57890405e5d4322cc9ea83cebb085523ced7be4144dac1e61" +checksum = "551f88106c6d5e7ccc7cd9a16f312dd3b5d36ea8b4954304657d5dfba115d4a0" dependencies = [ "cfg-if", "js-sys", @@ -3258,9 +3326,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.100" +version = "0.2.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" +checksum = "04264334509e04a7bf8690f2384ef5265f05143a4bff3889ab7a3269adab59c2" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -3268,31 +3336,31 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.100" +version = "0.2.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" +checksum = "420bc339d9f322e562942d52e115d57e950d12d88983a14c79b86859ee6c7ebc" dependencies = [ + "bumpalo", "proc-macro2", "quote", "syn", - "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.100" +version = 
"0.2.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +checksum = "76f218a38c84bcb33c25ec7059b07847d465ce0e0a76b995e134a45adcb6af76" dependencies = [ "unicode-ident", ] [[package]] name = "web-sys" -version = "0.3.77" +version = "0.3.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2" +checksum = "3a1f95c0d03a47f4ae1f7a64643a6bb97465d9b740f0fa8f90ea33915c99a9a1" dependencies = [ "js-sys", "wasm-bindgen", @@ -3310,18 +3378,18 @@ dependencies = [ [[package]] name = "winapi-util" -version = "0.1.9" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] name = "windows-core" -version = "0.61.2" +version = "0.62.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0fdd3ddb90610c7638aa2b3a3ab2904fb9e5cdbecc643ddb3647212781c4ae3" +checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" dependencies = [ "windows-implement", "windows-interface", @@ -3332,9 +3400,9 @@ dependencies = [ [[package]] name = "windows-implement" -version = "0.60.0" +version = "0.60.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a47fddd13af08290e67f4acabf4b459f647552718f683a7b415d290ac744a836" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", "quote", @@ -3343,9 +3411,9 @@ dependencies = [ [[package]] name = "windows-interface" -version = "0.59.1" +version = "0.59.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"bd9211b69f8dcdfa817bfd14bf1c97c9188afa36f4750130fcdf3f400eca9fa8" +checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ "proc-macro2", "quote", @@ -3354,76 +3422,69 @@ dependencies = [ [[package]] name = "windows-link" -version = "0.1.3" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" [[package]] name = "windows-result" -version = "0.3.4" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56f42bd332cc6c8eac5af113fc0c1fd6a8fd2aa08a0119358686e5160d0586c6" +checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" dependencies = [ "windows-link", ] [[package]] name = "windows-strings" -version = "0.4.2" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56e6c93f3a0c3b36176cb1327a4958a0353d5d166c2a35cb268ace15e91d3b57" +checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" dependencies = [ "windows-link", ] [[package]] name = "windows-sys" -version = "0.59.0" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.6", + "windows-targets", ] [[package]] name = "windows-sys" -version = "0.60.2" +version = "0.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ - "windows-targets 0.53.2", + "windows-targets", ] [[package]] -name = "windows-targets" -version = "0.52.6" +name = "windows-sys" 
+version = "0.61.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" dependencies = [ - "windows_aarch64_gnullvm 0.52.6", - "windows_aarch64_msvc 0.52.6", - "windows_i686_gnu 0.52.6", - "windows_i686_gnullvm 0.52.6", - "windows_i686_msvc 0.52.6", - "windows_x86_64_gnu 0.52.6", - "windows_x86_64_gnullvm 0.52.6", - "windows_x86_64_msvc 0.52.6", + "windows-link", ] [[package]] name = "windows-targets" -version = "0.53.2" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c66f69fcc9ce11da9966ddb31a40968cad001c5bedeb5c2b82ede4253ab48aef" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm 0.53.0", - "windows_aarch64_msvc 0.53.0", - "windows_i686_gnu 0.53.0", - "windows_i686_gnullvm 0.53.0", - "windows_i686_msvc 0.53.0", - "windows_x86_64_gnu 0.53.0", - "windows_x86_64_gnullvm 0.53.0", - "windows_x86_64_msvc 0.53.0", + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", ] [[package]] @@ -3432,84 +3493,42 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.53.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" - [[package]] name = "windows_aarch64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" -[[package]] -name = "windows_aarch64_msvc" -version = "0.53.0" 
-source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" - [[package]] name = "windows_i686_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" -[[package]] -name = "windows_i686_gnu" -version = "0.53.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" - [[package]] name = "windows_i686_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" -[[package]] -name = "windows_i686_gnullvm" -version = "0.53.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" - [[package]] name = "windows_i686_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" -[[package]] -name = "windows_i686_msvc" -version = "0.53.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" - [[package]] name = "windows_x86_64_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" -[[package]] -name = "windows_x86_64_gnu" -version = "0.53.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" - [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" 
-[[package]] -name = "windows_x86_64_gnullvm" -version = "0.53.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" - [[package]] name = "windows_x86_64_msvc" version = "0.52.6" @@ -3517,25 +3536,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] -name = "windows_x86_64_msvc" -version = "0.53.0" +name = "wit-bindgen" +version = "0.46.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" - -[[package]] -name = "wit-bindgen-rt" -version = "0.39.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" -dependencies = [ - "bitflags", -] +checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" [[package]] name = "writeable" -version = "0.6.1" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb" +checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" [[package]] name = "xz2" @@ -3548,11 +3558,10 @@ dependencies = [ [[package]] name = "yoke" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f41bb01b8226ef4bfd589436a297c53d118f65921786300e427be8d487695cc" +checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954" dependencies = [ - "serde", "stable_deref_trait", "yoke-derive", "zerofrom", @@ -3560,9 +3569,9 @@ dependencies = [ [[package]] name = "yoke-derive" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" +checksum = 
"b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" dependencies = [ "proc-macro2", "quote", @@ -3572,18 +3581,18 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.26" +version = "0.8.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1039dd0d3c310cf05de012d8a39ff557cb0d23087fd44cad61df08fc31907a2f" +checksum = "0894878a5fa3edfd6da3f88c4805f4c8558e2b996227a3d864f47fe11e38282c" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.26" +version = "0.8.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ecf5b4cc5364572d7f4c329661bcc82724222973f2cab6f050a4e5c22f75181" +checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831" dependencies = [ "proc-macro2", "quote", @@ -3613,9 +3622,9 @@ dependencies = [ [[package]] name = "zerotrie" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36f0bbd478583f79edad978b407914f61b2972f5af6fa089686016be8f9af595" +checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851" dependencies = [ "displaydoc", "yoke", @@ -3624,9 +3633,9 @@ dependencies = [ [[package]] name = "zerovec" -version = "0.11.2" +version = "0.11.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a05eb080e015ba39cc9e23bbe5e7fb04d5fb040350f99f34e338d5fdd294428" +checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002" dependencies = [ "yoke", "zerofrom", @@ -3635,9 +3644,9 @@ dependencies = [ [[package]] name = "zerovec-derive" -version = "0.11.1" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" +checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" dependencies = [ "proc-macro2", "quote", @@ -3646,9 +3655,9 @@ dependencies = [ [[package]] name = 
"zlib-rs" -version = "0.5.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "626bd9fa9734751fc50d6060752170984d7053f5a39061f524cda68023d4db8a" +checksum = "2f06ae92f42f5e5c42443fd094f245eb656abf56dd7cce9b8b263236565e00f2" [[package]] name = "zstd" @@ -3670,9 +3679,9 @@ dependencies = [ [[package]] name = "zstd-sys" -version = "2.0.15+zstd.1.5.7" +version = "2.0.16+zstd.1.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb81183ddd97d0c74cedf1d50d85c8d08c1b8b68ee863bdee9e706eedba1a237" +checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" dependencies = [ "cc", "pkg-config", diff --git a/tiledb/oxidize/Cargo.toml b/tiledb/oxidize/Cargo.toml index a65d844a55f..1a19ea4fc9a 100644 --- a/tiledb/oxidize/Cargo.toml +++ b/tiledb/oxidize/Cargo.toml @@ -24,10 +24,10 @@ version = "0.1.0" [workspace.dependencies] anyhow = "1" -arrow = { version = "55" } -cxx = "=1.0.138" -cxx-build = "1.0.138" -datafusion = { version = "47", features = [] } +arrow = { version = "56" } +cxx = "1" +cxx-build = "1" +datafusion = { version = "50", features = [] } iterator-ext = { path = "iterator-ext" } itertools = { version = "0.14" } num-traits = "0.2" diff --git a/tiledb/oxidize/expr/src/logical_expr.rs b/tiledb/oxidize/expr/src/logical_expr.rs index d3f60f56eb8..8274b4cc1bc 100644 --- a/tiledb/oxidize/expr/src/logical_expr.rs +++ b/tiledb/oxidize/expr/src/logical_expr.rs @@ -72,7 +72,7 @@ impl Display for LogicalExpr { pub fn make_conjunction(exprs: &[Box]) -> Box { Box::new(LogicalExpr( datafusion::logical_expr::utils::conjunction(exprs.iter().map(|e| e.0.clone())) - .unwrap_or(Expr::Literal(ScalarValue::Boolean(Some(true)))), + .unwrap_or(Expr::Literal(ScalarValue::Boolean(Some(true)), None)), )) } diff --git a/tiledb/oxidize/expr/src/physical_expr.rs b/tiledb/oxidize/expr/src/physical_expr.rs index 44fdd0bee30..56ab2f2d0ff 100644 --- a/tiledb/oxidize/expr/src/physical_expr.rs +++ 
b/tiledb/oxidize/expr/src/physical_expr.rs @@ -94,7 +94,7 @@ impl PhysicalExprOutput { ), ColumnarValue::Array(a) => { ColumnarValue::Array(compute::kernels::cast::cast(a, &arrow_type).map_err(|e| { - PhysicalExprOutputError::Cast(DataFusionError::ArrowError(e, None)) + PhysicalExprOutputError::Cast(DataFusionError::ArrowError(Box::new(e), None)) })?) } }; diff --git a/tiledb/oxidize/expr/src/query_condition.rs b/tiledb/oxidize/expr/src/query_condition.rs index 41ef17c18bc..4377ac58825 100644 --- a/tiledb/oxidize/expr/src/query_condition.rs +++ b/tiledb/oxidize/expr/src/query_condition.rs @@ -210,7 +210,7 @@ fn leaf_ast_to_binary_expr( Ok(Expr::BinaryExpr(BinaryExpr { left: Box::new(column), op: operator, - right: Box::new(Expr::Literal(right)), + right: Box::new(Expr::Literal(right, None)), })) } @@ -245,7 +245,7 @@ fn leaf_ast_to_in_list(schema: &ArraySchema, ast: &ASTNode, negated: bool) -> Re let in_list = match field.cell_val_num() { CellValNum::Single => scalars .map(ScalarValue::from) - .map(Expr::Literal) + .map(|s| Expr::Literal(s, None)) .collect::>(), CellValNum::Fixed(nz) => { let fixed_size = nz.get() as usize; @@ -284,8 +284,7 @@ fn leaf_ast_to_in_list(schema: &ArraySchema, ast: &ASTNode, negated: bool) -> Re None, )) }) - .map(ScalarValue::FixedSizeList) - .map(Expr::Literal) + .map(|s| Expr::Literal(ScalarValue::FixedSizeList(s), None)) .collect::>() } CellValNum::Var => { @@ -322,8 +321,7 @@ fn leaf_ast_to_in_list(schema: &ArraySchema, ast: &ASTNode, negated: bool) -> Re None, )) }) - .map(ScalarValue::LargeList) - .map(Expr::Literal) + .map(|s| Expr::Literal(ScalarValue::LargeList(s), None)) .collect::>() } }; @@ -356,8 +354,7 @@ fn leaf_ast_to_in_list(schema: &ArraySchema, ast: &ASTNode, negated: bool) -> Re let elts = ast.get_data().as_slice()[w[0] as usize..w[1] as usize].to_vec(); String::from_utf8(elts).map_err(UserError::ExpectedUtf8) }) - .map_ok(|s| ScalarValue::LargeUtf8(Some(s))) - .map_ok(Expr::Literal) + .map_ok(|s| 
Expr::Literal(ScalarValue::LargeUtf8(Some(s)), None)) .collect::, _>>()?; Ok(Expr::InList(InList { @@ -395,16 +392,18 @@ fn leaf_ast_to_null_test(schema: &ArraySchema, ast: &ASTNode) -> Result Ok(Expr::Literal(ScalarValue::Boolean(Some(false)))), + QueryConditionOp::ALWAYS_FALSE => { + Ok(Expr::Literal(ScalarValue::Boolean(Some(false)), None)) + } QueryConditionOp::LT | QueryConditionOp::LE | QueryConditionOp::GT | QueryConditionOp::GE => { // TODO: are these invalid? - Ok(Expr::Literal(ScalarValue::Boolean(Some(false)))) + Ok(Expr::Literal(ScalarValue::Boolean(Some(false)), None)) } invalid => Err(InternalError::InvalidOp(invalid.repr.into()).into()), } @@ -532,10 +531,12 @@ fn to_datafusion_impl( // which we must replicate here Ok(Expr::IsNotNull(Box::new(column))) } else { - Ok(Expr::Literal(ScalarValue::Boolean(Some(true)))) + Ok(Expr::Literal(ScalarValue::Boolean(Some(true)), None)) } } - QueryConditionOp::ALWAYS_FALSE => Ok(Expr::Literal(ScalarValue::Boolean(Some(false)))), + QueryConditionOp::ALWAYS_FALSE => { + Ok(Expr::Literal(ScalarValue::Boolean(Some(false)), None)) + } invalid => Err(InternalError::InvalidOp(invalid.repr.into()).into()), } } From 66ff1fc39697d5ac591730e902fbfabcc37451b6 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Thu, 13 Nov 2025 14:45:52 -0500 Subject: [PATCH 48/52] Single QueryPredicates FFI boundary compiles, not tested --- test/src/unit-capi-config.cc | 4 - tiledb/oxidize/CMakeLists.txt | 13 +- tiledb/oxidize/Cargo.lock | 14 +- tiledb/oxidize/Cargo.toml | 4 +- tiledb/oxidize/arrow/src/enumeration.rs | 16 -- tiledb/oxidize/arrow/src/lib.rs | 70 ------- tiledb/oxidize/arrow/src/record_batch.rs | 24 +-- tiledb/oxidize/arrow/src/schema.rs | 33 --- tiledb/oxidize/expr/Cargo.toml | 11 - tiledb/oxidize/expr/src/lib.rs | 86 +------- tiledb/oxidize/expr/src/logical_expr.rs | 88 ++------ tiledb/oxidize/expr/src/physical_expr.rs | 155 -------------- .../{session => query-predicates}/Cargo.toml | 3 +- .../{expr => 
query-predicates}/build.rs | 0 tiledb/oxidize/query-predicates/src/lib.rs | 197 ++++++++++++++++++ tiledb/oxidize/session/build.rs | 4 - tiledb/oxidize/session/src/lib.rs | 115 ---------- .../staticlibs/core-objects/Cargo.toml | 2 +- .../staticlibs/core-objects/src/lib.rs | 2 +- .../query-condition/src/datafusion.rs} | 0 .../test-support/query-condition/src/lib.rs | 1 + tiledb/sm/config/config.cc | 3 - tiledb/sm/config/config.h | 3 - tiledb/sm/query/query.cc | 110 ++-------- tiledb/sm/query/query.h | 14 +- tiledb/sm/query/query_condition.cc | 122 +---------- tiledb/sm/query/query_condition.h | 96 ++------- tiledb/sm/query/strategy_base.h | 12 +- 28 files changed, 290 insertions(+), 912 deletions(-) delete mode 100644 tiledb/oxidize/expr/src/physical_expr.rs rename tiledb/oxidize/{session => query-predicates}/Cargo.toml (88%) rename tiledb/oxidize/{expr => query-predicates}/build.rs (100%) create mode 100644 tiledb/oxidize/query-predicates/src/lib.rs delete mode 100644 tiledb/oxidize/session/build.rs delete mode 100644 tiledb/oxidize/session/src/lib.rs rename tiledb/oxidize/{expr/src/query_condition.rs => test-support/query-condition/src/datafusion.rs} (100%) diff --git a/test/src/unit-capi-config.cc b/test/src/unit-capi-config.cc index 17167df355d..6e6c6b12163 100644 --- a/test/src/unit-capi-config.cc +++ b/test/src/unit-capi-config.cc @@ -291,8 +291,6 @@ void check_save_to_file() { ss << "sm.memory_budget_var 10737418240\n"; ss << "sm.merge_overlapping_ranges_experimental true\n"; ss << "sm.partial_tile_offsets_loading false\n"; - ss << "sm.query.condition_evaluator " << Config::SM_QUERY_CONDITION_EVALUATOR - << "\n"; ss << "sm.query.dense.qc_coords_mode false\n"; ss << "sm.query.dense.reader refactored\n"; ss << "sm.query.sparse_global_order.preprocess_tile_merge " @@ -648,8 +646,6 @@ TEST_CASE("C API: Test config iter", "[capi][config]") { all_param_values["sm.query.sparse_global_order.preprocess_tile_merge"] = 
Config::SM_QUERY_SPARSE_GLOBAL_ORDER_PREPROCESS_TILE_MERGE; all_param_values["sm.query.sparse_global_order.reader"] = "refactored"; - all_param_values["sm.query.condition_evaluator"] = - Config::SM_QUERY_CONDITION_EVALUATOR; all_param_values["sm.query.sparse_unordered_with_dups.reader"] = "refactored"; all_param_values["sm.mem.consolidation.buffers_weight"] = "1"; all_param_values["sm.mem.consolidation.reader_weight"] = "3"; diff --git a/tiledb/oxidize/CMakeLists.txt b/tiledb/oxidize/CMakeLists.txt index de9bc32f225..3266675d444 100644 --- a/tiledb/oxidize/CMakeLists.txt +++ b/tiledb/oxidize/CMakeLists.txt @@ -21,13 +21,6 @@ cxxbridge( lib.rs ) -cxxbridge( - NAME - expr - SOURCES - lib.rs -) - cxxbridge( NAME cxx-interface @@ -45,7 +38,7 @@ cxxbridge( cxxbridge( NAME - session + query-predicates SOURCES lib.rs ) @@ -68,8 +61,7 @@ oxidize( EXPORT arrow cxx-interface - expr - session + query-predicates ) oxidize( @@ -87,7 +79,6 @@ oxidize( EXPORT arrow cxx-interface - expr test-support-cxx-interface ) diff --git a/tiledb/oxidize/Cargo.lock b/tiledb/oxidize/Cargo.lock index cc42a117176..ce097d55352 100644 --- a/tiledb/oxidize/Cargo.lock +++ b/tiledb/oxidize/Cargo.lock @@ -2923,7 +2923,7 @@ version = "0.1.0" dependencies = [ "tiledb-arrow", "tiledb-expr", - "tiledb-session", + "tiledb-query-predicates", ] [[package]] @@ -2946,17 +2946,8 @@ dependencies = [ name = "tiledb-expr" version = "0.1.0" dependencies = [ - "anyhow", - "arrow", - "cxx", - "cxx-build", "datafusion", - "itertools", - "num-traits", - "thiserror 2.0.17", "tiledb-arrow", - "tiledb-cxx-interface", - "tiledb-datatype", ] [[package]] @@ -2991,10 +2982,11 @@ version = "0.1.0" source = "git+https://github.com/TileDB-Inc/tiledb-rs.git?branch=main#1dafdf310ed2e8f4e314a40dff9f3ff46a22c64d" [[package]] -name = "tiledb-session" +name = "tiledb-query-predicates" version = "0.1.0" dependencies = [ "anyhow", + "arrow", "cxx", "cxx-build", "datafusion", diff --git a/tiledb/oxidize/Cargo.toml 
b/tiledb/oxidize/Cargo.toml index 1a19ea4fc9a..e30351be48b 100644 --- a/tiledb/oxidize/Cargo.toml +++ b/tiledb/oxidize/Cargo.toml @@ -10,7 +10,7 @@ members = [ "staticlibs/core-objects", "staticlibs/unit-arithmetic", "staticlibs/unit-query-condition", - "session", + "query-predicates", "test-support/array-schema", "test-support/cxx-interface", "test-support/ffi", @@ -39,7 +39,7 @@ tiledb-cxx-interface = { path = "cxx-interface" } tiledb-datatype = { path = "datatype" } tiledb-expr = { path = "expr" } tiledb-pod = { git = "https://github.com/TileDB-Inc/tiledb-rs.git", branch = "main", features = [ "proptest-strategies" ] } -tiledb-session = { path = "session" } +tiledb-query-predicates = { path = "query-predicates" } tiledb-test-array-schema = { path = "test-support/array-schema" } tiledb-test-cells = { package = "cells", git = "https://github.com/TileDB-Inc/tiledb-rs.git", branch = "main", features = [ "proptest-strategies" ] } tiledb-test-ffi = { path = "test-support/ffi" } diff --git a/tiledb/oxidize/arrow/src/enumeration.rs b/tiledb/oxidize/arrow/src/enumeration.rs index 5ff15793a2b..808e74fafe9 100644 --- a/tiledb/oxidize/arrow/src/enumeration.rs +++ b/tiledb/oxidize/arrow/src/enumeration.rs @@ -15,22 +15,6 @@ pub enum Error { Variants(#[from] crate::record_batch::FieldError), } -/// Returns an [ArrowArray] whose elements are the variants of an [Enumeration]. -/// -/// # Safety -/// -/// When possible this function avoids copying data. This means that the -/// returned [ArrowArray] may reference data which lives inside the [Enumeration]. -/// It is not safe to use the value returned from this function after -/// the [Enumeration] is destructed. The caller must take care to abide this -/// requirement. Otherwise this function is safe to use. 
-pub unsafe fn array_from_enumeration_ffi( - enumeration: &Enumeration, -) -> Result, Error> { - let a = unsafe { array_from_enumeration(enumeration) }?; - Ok(Box::new(super::ArrowArray(a))) -} - /// Returns an [ArrowArray] whose elements are the variants of an [Enumeration]. /// /// # Safety diff --git a/tiledb/oxidize/arrow/src/lib.rs b/tiledb/oxidize/arrow/src/lib.rs index 49eca807335..13c9784778e 100644 --- a/tiledb/oxidize/arrow/src/lib.rs +++ b/tiledb/oxidize/arrow/src/lib.rs @@ -8,79 +8,9 @@ pub mod ffi { Storage, View, } - - #[namespace = "tiledb::sm"] - extern "C++" { - include!("tiledb/sm/array_schema/array_schema.h"); - include!("tiledb/sm/array_schema/enumeration.h"); - include!("tiledb/sm/query/readers/result_tile.h"); - - type ArraySchema = tiledb_cxx_interface::sm::array_schema::ArraySchema; - type Enumeration = tiledb_cxx_interface::sm::array_schema::Enumeration; - type ResultTile = tiledb_cxx_interface::sm::query::readers::ResultTile; - } - - #[namespace = "tiledb::oxidize::arrow::schema"] - extern "Rust" { - type ArrowArraySchema; - - #[cxx_name = "create"] - fn array_schema_create_arrow_schema( - schema: &ArraySchema, - which: WhichSchema, - ) -> Result>; - - #[cxx_name = "project"] - fn array_schema_project_arrow_schema( - schema: &ArraySchema, - which: WhichSchema, - select: &Vec, - ) -> Result>; - } - - #[namespace = "tiledb::oxidize::arrow::array"] - extern "Rust" { - type ArrowArray; - - #[cxx_name = "from_enumeration"] - unsafe fn array_from_enumeration_ffi(enumeration: &Enumeration) -> Result>; - } - - #[namespace = "tiledb::oxidize::arrow::record_batch"] - extern "Rust" { - type ArrowRecordBatch; - - #[cxx_name = "create"] - unsafe fn result_tile_to_record_batch( - schema: &ArrowArraySchema, - tile: &ResultTile, - ) -> Result>; - } } pub mod enumeration; pub mod offsets; pub mod record_batch; pub mod schema; - -use std::sync::Arc; - -use enumeration::array_from_enumeration_ffi; -use record_batch::{ArrowRecordBatch, to_record_batch as 
result_tile_to_record_batch}; -use schema::{ - ArrowArraySchema, cxx::project_arrow as array_schema_project_arrow_schema, - cxx::to_arrow as array_schema_create_arrow_schema, -}; - -/// Wraps a [dyn ArrowArray] for passing across the FFI boundary. -pub struct ArrowArray(pub Arc); - -unsafe impl cxx::ExternType for ArrowRecordBatch { - type Id = cxx::type_id!("tiledb::oxidize::arrow::record_batch::ArrowRecordBatch"); - type Kind = cxx::kind::Opaque; -} - -unsafe impl cxx::ExternType for ArrowArraySchema { - type Id = cxx::type_id!("tiledb::oxidize::arrow::schema::ArrowArraySchema"); - type Kind = cxx::kind::Opaque; -} diff --git a/tiledb/oxidize/arrow/src/record_batch.rs b/tiledb/oxidize/arrow/src/record_batch.rs index 6c94435534d..1107b1d7f6b 100644 --- a/tiledb/oxidize/arrow/src/record_batch.rs +++ b/tiledb/oxidize/arrow/src/record_batch.rs @@ -10,11 +10,10 @@ use arrow::array::{ PrimitiveArray, }; use arrow::buffer::{Buffer, NullBuffer, OffsetBuffer, ScalarBuffer}; -use arrow::datatypes::{self as adt, ArrowPrimitiveType, Field}; +use arrow::datatypes::{self as adt, ArrowPrimitiveType, Field, Schema as ArrowSchema}; use arrow::record_batch::{RecordBatch, RecordBatchOptions}; use tiledb_cxx_interface::sm::query::readers::{ResultTile, TileTuple}; -use super::*; use crate::offsets::Error as OffsetsError; /// An error creating a [RecordBatch] to represent a [ResultTile]. @@ -45,11 +44,6 @@ pub enum FieldError { EnumerationNotSupported, } -/// Wraps a [RecordBatch] for passing across the FFI boundary. -pub struct ArrowRecordBatch { - pub arrow: RecordBatch, -} - /// Returns a [RecordBatch] which contains the same contents as a [ResultTile]. /// /// # Safety @@ -60,11 +54,10 @@ pub struct ArrowRecordBatch { /// long as the returned [RecordBatch] is not used after the [ResultTile] /// is destructed. 
pub unsafe fn to_record_batch( - schema: &ArrowArraySchema, + schema: &Arc, tile: &ResultTile, -) -> Result, Error> { +) -> Result { let columns = schema - .schema .fields() .iter() .map(|f| { @@ -91,13 +84,12 @@ pub unsafe fn to_record_batch( .collect::>, _>>()?; // SAFETY: should be clear from iteration - assert_eq!(schema.schema.fields().len(), columns.len()); + assert_eq!(schema.fields().len(), columns.len()); // SAFETY: `tile_to_arrow_array` must do this, major internal error if not // which is not recoverable assert!( schema - .schema .fields() .iter() .zip(columns.iter()) @@ -111,24 +103,24 @@ pub unsafe fn to_record_batch( assert!( columns.iter().all(|c| c.len() as u64 == tile.cell_num()), "Columns do not all have same number of cells: {:?} {:?}", - schema.schema.fields(), + schema.fields(), columns.iter().map(|c| c.len()).collect::>() ); // SAFETY: the four asserts above rule out each of the possible error conditions let arrow = if columns.is_empty() { RecordBatch::try_new_with_options( - Arc::clone(&schema.schema), + Arc::clone(&schema), columns, &RecordBatchOptions::new().with_row_count(Some(tile.cell_num() as usize)), ) } else { - RecordBatch::try_new(Arc::clone(&schema.schema), columns) + RecordBatch::try_new(Arc::clone(&schema), columns) }; let arrow = arrow.expect("Logic error: preconditions for constructing RecordBatch not met"); - Ok(Box::new(ArrowRecordBatch { arrow })) + Ok(arrow) } /// Returns an [ArrowArray] which contains the same contents as the provided diff --git a/tiledb/oxidize/arrow/src/schema.rs b/tiledb/oxidize/arrow/src/schema.rs index ba5ebe28f60..093470ea028 100644 --- a/tiledb/oxidize/arrow/src/schema.rs +++ b/tiledb/oxidize/arrow/src/schema.rs @@ -64,39 +64,6 @@ pub struct ArrowArraySchema { pub enumerations: Arc, } -pub mod cxx { - use super::*; - - pub fn to_arrow( - array_schema: &ArraySchema, - which: WhichSchema, - ) -> Result, Error> { - let (schema, enumerations) = super::project_arrow(array_schema, which, |_: &Field| 
true)?; - Ok(Box::new(ArrowArraySchema { - schema: Arc::new(schema), - enumerations: Arc::new(enumerations), - })) - } - - /// Returns a [Schema] which represents the physical field types of - /// the fields from `array_schema` which are contained in `select`. - // NB: we use `Vec` for facilitating the FFI boundary - #[allow(clippy::ptr_arg)] - pub fn project_arrow( - array_schema: &ArraySchema, - which: WhichSchema, - select: &Vec, - ) -> Result, Error> { - let (schema, enumerations) = super::project_arrow(array_schema, which, |field: &Field| { - select.iter().any(|s| s.as_str() == field.name_cxx()) - })?; - Ok(Box::new(ArrowArraySchema { - schema: Arc::new(schema), - enumerations: Arc::new(enumerations), - })) - } -} - pub fn to_arrow( array_schema: &ArraySchema, which: WhichSchema, diff --git a/tiledb/oxidize/expr/Cargo.toml b/tiledb/oxidize/expr/Cargo.toml index 9313852ce89..18a3f084d5e 100644 --- a/tiledb/oxidize/expr/Cargo.toml +++ b/tiledb/oxidize/expr/Cargo.toml @@ -5,16 +5,5 @@ rust-version = { workspace = true } version = { workspace = true } [dependencies] -anyhow = { workspace = true } -arrow = { workspace = true } -cxx = { workspace = true } datafusion = { workspace = true } -itertools = { workspace = true } -num-traits = { workspace = true } -thiserror = { workspace = true } tiledb-arrow = { workspace = true } -tiledb-cxx-interface = { workspace = true } -tiledb-datatype = { workspace = true } - -[build-dependencies] -cxx-build = { workspace = true } diff --git a/tiledb/oxidize/expr/src/lib.rs b/tiledb/oxidize/expr/src/lib.rs index b7f7355e81b..cfbd5121ce3 100644 --- a/tiledb/oxidize/expr/src/lib.rs +++ b/tiledb/oxidize/expr/src/lib.rs @@ -1,85 +1 @@ -#[cxx::bridge] -mod ffi { - #[namespace = "tiledb::sm"] - extern "C++" { - include!("tiledb/sm/array_schema/array_schema.h"); - include!("tiledb/sm/query/ast/query_ast.h"); - - type ArraySchema = tiledb_cxx_interface::sm::array_schema::ArraySchema; - type ASTNode = 
tiledb_cxx_interface::sm::query::ast::ASTNode; - type Datatype = tiledb_cxx_interface::sm::enums::Datatype; - } - - extern "C++" { - include!("tiledb/oxidize/arrow.h"); - - #[namespace = "tiledb::oxidize::arrow::record_batch"] - type ArrowRecordBatch = tiledb_arrow::record_batch::ArrowRecordBatch; - - #[namespace = "tiledb::oxidize::arrow::schema"] - type ArrowArraySchema = tiledb_arrow::schema::ArrowArraySchema; - - #[namespace = "tiledb::oxidize::arrow::schema"] - type WhichSchema = tiledb_arrow::schema::WhichSchema; - } - - #[namespace = "tiledb::oxidize::datafusion::logical_expr"] - extern "Rust" { - type LogicalExpr; - fn is_predicate(&self, schema: &ArraySchema) -> Result; - fn has_aggregate_functions(&self) -> bool; - fn to_string(&self) -> String; - - fn columns(&self) -> Vec; - - #[cxx_name = "create"] - fn query_condition_to_logical_expr( - schema: &ArraySchema, - which: &WhichSchema, - query_condition: &ASTNode, - ) -> Result>; - - /// Returns a conjunction of the logical exprs `e1 AND e2 AND ... AND eN`. 
- fn make_conjunction(exprs: &[Box]) -> Box; - } - - #[namespace = "tiledb::oxidize::datafusion::physical_expr"] - extern "Rust" { - type PhysicalExpr; - fn evaluate(&self, records: &ArrowRecordBatch) -> Result>; - - // TODO: we can avoid the double box using the trait object trick, - // see the pdavis 65154 branch - #[cxx_name = "create"] - fn create_physical_expr( - schema: &ArrowArraySchema, - expr: Box, - ) -> Result>; - } - - #[namespace = "tiledb::oxidize::datafusion::physical_expr"] - extern "Rust" { - type PhysicalExprOutput; - - fn is_scalar(&self) -> bool; - fn is_array(&self) -> bool; - - fn cast_to(&self, datatype: Datatype) -> Result>; - - fn values_u8(&self) -> Result<&[u8]>; - fn values_u64(&self) -> Result<&[u64]>; - } -} - -mod logical_expr; -mod physical_expr; -mod query_condition; - -pub use logical_expr::{LogicalExpr, make_conjunction}; -pub use physical_expr::{PhysicalExpr, PhysicalExprOutput, create_physical_expr}; -pub use query_condition::to_datafusion as query_condition_to_logical_expr; - -unsafe impl cxx::ExternType for LogicalExpr { - type Id = cxx::type_id!("tiledb::oxidize::datafusion::logical_expr::LogicalExpr"); - type Kind = cxx::kind::Opaque; -} +pub mod logical_expr; diff --git a/tiledb/oxidize/expr/src/logical_expr.rs b/tiledb/oxidize/expr/src/logical_expr.rs index 8274b4cc1bc..5b10d953a74 100644 --- a/tiledb/oxidize/expr/src/logical_expr.rs +++ b/tiledb/oxidize/expr/src/logical_expr.rs @@ -1,79 +1,25 @@ //! Provides definitions for interacting with DataFusion logical expressions. 
-use std::fmt::{Display, Formatter, Result as FmtResult}; - -use arrow::datatypes::DataType as ArrowDataType; +use datafusion::common::DataFusionError; use datafusion::common::tree_node::{TreeNode, TreeNodeRecursion, TreeNodeVisitor}; -use datafusion::common::{Column, DFSchema, DataFusionError, ScalarValue}; -use datafusion::logical_expr::{Expr, ExprSchemable}; -use tiledb_arrow::schema::WhichSchema; -use tiledb_cxx_interface::sm::array_schema::ArraySchema; - -#[derive(Debug, thiserror::Error)] -pub enum TypeError { - #[error("Schema error: {0}")] - ArraySchema(#[from] tiledb_arrow::schema::Error), - #[error("Expression error: {0}")] - Expr(#[from] DataFusionError), -} - -/// Wraps a DataFusion [Expr] for passing across the FFI boundary. -pub struct LogicalExpr(pub Expr); - -impl LogicalExpr { - pub fn columns(&self) -> Vec { - self.0 - .column_refs() - .into_iter() - .map(|c| c.name.clone()) - .collect() - } - - pub fn output_type(&self, schema: &ArraySchema) -> Result { - let cols = self.0.column_refs(); - let arrow_schema = tiledb_arrow::schema::project_arrow(schema, WhichSchema::View, |f| { - let Ok(field_name) = f.name() else { - // NB: if the field name is not UTF-8 then it cannot possibly match the column name - return false; - }; - cols.contains(&Column::new_unqualified(field_name)) - })?; - let dfschema = { - // SAFETY: the only error we can get from the above is if the arrow schema - // has duplicate names, which will not happen since it was constructed from - // an ArraySchema which does not allow duplicate names - DFSchema::try_from(arrow_schema.0).unwrap() - }; - - Ok(self.0.get_type(&dfschema)?) 
- } - - pub fn has_aggregate_functions(&self) -> bool { - let rec = self.0.visit(&mut AggregateFunctionChecker::default()); - let rec = { - // SAFETY: AggregateFunctionChecker does not return any errors - rec.unwrap() - }; - matches!(rec, TreeNodeRecursion::Stop) - } - - pub fn is_predicate(&self, schema: &ArraySchema) -> Result { - Ok(matches!(self.output_type(schema)?, ArrowDataType::Boolean)) - } -} - -impl Display for LogicalExpr { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - self.0.human_display().fmt(f) - } +use datafusion::logical_expr::Expr; + +/// Returns a list of the names of the columns used in this expression. +pub fn columns(expr: &Expr) -> Vec { + expr.column_refs() + .into_iter() + .map(|c| c.name.clone()) + .collect() } -/// Returns a conjunction of the logical exprs `e1 AND e2 AND ... AND eN`. -pub fn make_conjunction(exprs: &[Box]) -> Box { - Box::new(LogicalExpr( - datafusion::logical_expr::utils::conjunction(exprs.iter().map(|e| e.0.clone())) - .unwrap_or(Expr::Literal(ScalarValue::Boolean(Some(true)), None)), - )) +/// Returns true if `expr` contains aggregate functions and false otherwise. +pub fn has_aggregate_functions(expr: &Expr) -> bool { + let rec = expr.visit(&mut AggregateFunctionChecker::default()); + let rec = { + // SAFETY: AggregateFunctionChecker does not return any errors + rec.unwrap() + }; + matches!(rec, TreeNodeRecursion::Stop) } #[derive(Default)] diff --git a/tiledb/oxidize/expr/src/physical_expr.rs b/tiledb/oxidize/expr/src/physical_expr.rs deleted file mode 100644 index 56ab2f2d0ff..00000000000 --- a/tiledb/oxidize/expr/src/physical_expr.rs +++ /dev/null @@ -1,155 +0,0 @@ -//! Provides definitions for compiling DataFusion logical expressions -//! into DataFusion physical expressions which can be evaluated; -//! and definitions for evaluating those physical expressions. 
- -use std::sync::Arc; - -use datafusion::common::arrow::datatypes::DataType as ArrowDataType; -use datafusion::common::arrow::{array as aa, compute, datatypes as adt}; -use datafusion::common::{DFSchema, DataFusionError, ScalarValue}; -use datafusion::execution::context::ExecutionProps; -use datafusion::logical_expr::ColumnarValue; -use datafusion::physical_plan::PhysicalExpr as DatafusionPhysicalExpr; -use tiledb_arrow::record_batch::ArrowRecordBatch; -use tiledb_arrow::schema::ArrowArraySchema; -use tiledb_cxx_interface::sm::enums::Datatype; - -use crate::LogicalExpr; - -/// An error using a [PhysicalExpr]. -#[derive(Debug, thiserror::Error)] -pub enum PhysicalExprError { - #[error("Compiling expression: {0}")] - Create(#[source] DataFusionError), - #[error("Evaluate expression: {0}")] - Evaluate(#[source] DataFusionError), -} - -/// An error using the output of physical expression evaluation. -#[derive(Debug, thiserror::Error)] -pub enum PhysicalExprOutputError { - #[error("Target type is unavailable: {0}")] - TypeUnavailable(#[source] tiledb_arrow::schema::FieldError), - #[error("Cast expression result: {0}")] - Cast(#[source] DataFusionError), - #[error("Cannot read array as static datatype '{0}': found '{1}'")] - InvalidStaticType(&'static str, ArrowDataType), -} - -/// Wraps a DataFusion [PhysicalExpr] for passing across the FFI boundary. -pub struct PhysicalExpr(Arc); - -impl PhysicalExpr { - pub fn evaluate( - &self, - records: &ArrowRecordBatch, - ) -> Result, PhysicalExprError> { - Ok(Box::new(PhysicalExprOutput( - self.0 - .evaluate(&records.arrow) - .map_err(PhysicalExprError::Evaluate)?, - ))) - } -} - -/// Returns a [PhysicalExpr] which evaluates a [LogicalExpr] for the given `schema`. 
-pub fn create_physical_expr( - schema: &ArrowArraySchema, - expr: Box, -) -> Result, PhysicalExprError> { - let dfschema = DFSchema::from_field_specific_qualified_schema( - vec![None; schema.schema.fields().len()], - &schema.schema, - ) - .map_err(PhysicalExprError::Create)?; - let dfexpr = - datafusion::physical_expr::create_physical_expr(&expr.0, &dfschema, &ExecutionProps::new()) - .map_err(PhysicalExprError::Create)?; - Ok(Box::new(PhysicalExpr(dfexpr))) -} - -/// Wraps the output of physical expression evaluation for passing across the FFI boundary. -pub struct PhysicalExprOutput(ColumnarValue); - -impl PhysicalExprOutput { - pub fn is_scalar(&self) -> bool { - matches!(self.0, ColumnarValue::Scalar(_)) - } - - pub fn is_array(&self) -> bool { - matches!(self.0, ColumnarValue::Array(_)) - } - - /// Cast `self` to a target datatype. - pub fn cast_to( - &self, - datatype: Datatype, - ) -> Result, PhysicalExprOutputError> { - let arrow_type = tiledb_arrow::schema::arrow_primitive_datatype(datatype) - .map_err(PhysicalExprOutputError::TypeUnavailable)?; - let columnar_value = match &self.0 { - ColumnarValue::Scalar(s) => ColumnarValue::Scalar( - s.cast_to(&arrow_type) - .map_err(PhysicalExprOutputError::Cast)?, - ), - ColumnarValue::Array(a) => { - ColumnarValue::Array(compute::kernels::cast::cast(a, &arrow_type).map_err(|e| { - PhysicalExprOutputError::Cast(DataFusionError::ArrowError(Box::new(e), None)) - })?) - } - }; - Ok(Box::new(PhysicalExprOutput(columnar_value))) - } - - /// Returns the result as a `&[u8]` if it is of that type, - /// and returns `Err` otherwise. 
- pub fn values_u8(&self) -> Result<&[u8], PhysicalExprOutputError> { - match &self.0 { - ColumnarValue::Scalar(s) => match s { - ScalarValue::UInt8(maybe_byte) => Ok(maybe_byte.as_slice()), - _ => Err(PhysicalExprOutputError::InvalidStaticType( - "u8", - s.data_type().clone(), - )), - }, - ColumnarValue::Array(a) => { - if *a.data_type() == adt::DataType::UInt8 { - // SAFETY: type check right above this - let primitive_array = a.as_any().downcast_ref::().unwrap(); - Ok(primitive_array.values().as_ref()) - } else { - Err(PhysicalExprOutputError::InvalidStaticType( - "u8", - a.data_type().clone(), - )) - } - } - } - } - - /// Returns the result as a `&[u64]` if it is of that type, - /// and returns `Err` otherwise. - pub fn values_u64(&self) -> Result<&[u64], PhysicalExprOutputError> { - match &self.0 { - ColumnarValue::Scalar(s) => match s { - ScalarValue::UInt64(maybe_value) => Ok(maybe_value.as_slice()), - _ => Err(PhysicalExprOutputError::InvalidStaticType( - "u64", - s.data_type().clone(), - )), - }, - ColumnarValue::Array(a) => { - if *a.data_type() == adt::DataType::UInt64 { - // SAFETY: type check right above this - let primitive_array = a.as_any().downcast_ref::().unwrap(); - Ok(primitive_array.values().as_ref()) - } else { - Err(PhysicalExprOutputError::InvalidStaticType( - "u64", - a.data_type().clone(), - )) - } - } - } - } -} diff --git a/tiledb/oxidize/session/Cargo.toml b/tiledb/oxidize/query-predicates/Cargo.toml similarity index 88% rename from tiledb/oxidize/session/Cargo.toml rename to tiledb/oxidize/query-predicates/Cargo.toml index e9976098a41..f085ec6a3dc 100644 --- a/tiledb/oxidize/session/Cargo.toml +++ b/tiledb/oxidize/query-predicates/Cargo.toml @@ -1,11 +1,12 @@ [package] -name = "tiledb-session" +name = "tiledb-query-predicates" edition = { workspace = true } rust-version = { workspace = true } version = { workspace = true } [dependencies] anyhow = { workspace = true } +arrow = { workspace = true } cxx = { workspace = true } datafusion = 
{ workspace = true } itertools = { workspace = true } diff --git a/tiledb/oxidize/expr/build.rs b/tiledb/oxidize/query-predicates/build.rs similarity index 100% rename from tiledb/oxidize/expr/build.rs rename to tiledb/oxidize/query-predicates/build.rs diff --git a/tiledb/oxidize/query-predicates/src/lib.rs b/tiledb/oxidize/query-predicates/src/lib.rs new file mode 100644 index 00000000000..9c36ddcb467 --- /dev/null +++ b/tiledb/oxidize/query-predicates/src/lib.rs @@ -0,0 +1,197 @@ +#[cxx::bridge] +mod ffi { + #[namespace = "tiledb::sm"] + extern "C++" { + include!("tiledb/sm/array_schema/array_schema.h"); + include!("tiledb/sm/query/readers/result_tile.h"); + + type ArraySchema = tiledb_cxx_interface::sm::array_schema::ArraySchema; + type ResultTile = tiledb_cxx_interface::sm::query::readers::ResultTile; + } + + #[namespace = "tiledb::oxidize"] + extern "Rust" { + type QueryPredicates; + + #[cxx_name = "new_query_predicates"] + fn new_query_predicates_ffi(schema: &ArraySchema) -> Result>; + + fn add_predicate(&mut self, expr: &str) -> Result<()>; + + fn evaluate_into_bitmap(&self, tile: &ResultTile, bitmap: &mut [u8]) -> Result<()>; + } +} + +use std::sync::Arc; + +use arrow::datatypes::DataType; +use datafusion::common::tree_node::TreeNode; +use datafusion::common::{DFSchema, ScalarValue}; +use datafusion::execution::context::ExecutionProps; +use datafusion::execution::context::SessionContext; +use datafusion::execution::session_state::SessionStateBuilder; +use datafusion::logical_expr::ExprSchemable; +use datafusion::physical_plan::{ColumnarValue, PhysicalExpr}; +use tiledb_cxx_interface::sm::array_schema::ArraySchema; +use tiledb_cxx_interface::sm::query::readers::ResultTile; + +#[derive(Debug, thiserror::Error)] +pub enum ParseExprError { + #[error("Schema error: {0}")] + Schema(#[from] tiledb_arrow::schema::Error), +} + +#[derive(Debug, thiserror::Error)] +pub enum AddPredicateError { + #[error("Parse error: {0}")] + Parse(#[source] 
datafusion::common::DataFusionError), + #[error("Expression is not a predicate: found return type {0}")] + NotAPredicate(DataType), + #[error("Expression contains aggregate functions which are not supported in predicates")] + ContainsAggregateFunctions, + #[error("Type coercion error: {0}")] + TypeCoercion(#[source] datafusion::common::DataFusionError), + #[error("Output type error: {0}")] + OutputType(#[source] datafusion::common::DataFusionError), + #[error("Expression compile error: {0}")] + Compile(#[source] datafusion::common::DataFusionError), +} + +#[derive(Debug, thiserror::Error)] +pub enum EvaluatePredicateError { + #[error("Result tile error: {0}")] + ResultTile(#[from] tiledb_arrow::record_batch::Error), + #[error("Evaluation error: {0}")] + Evaluate(#[source] datafusion::common::DataFusionError), +} + +pub struct QueryPredicates { + dfsession: SessionContext, + dfschema: DFSchema, + predicate: Option>, +} + +impl QueryPredicates { + pub fn new(schema: &ArraySchema) -> Result { + let (arrow_schema, _) = + tiledb_arrow::schema::to_arrow(schema, tiledb_arrow::schema::WhichSchema::View)?; + let dfschema = { + // SAFETY: this only errors if the names are not unique, + // which they will be because `ArraySchema` requires it + DFSchema::try_from(arrow_schema).unwrap() + }; + + Ok(QueryPredicates { + dfsession: SessionContext::from( + SessionStateBuilder::new_with_default_features().build(), + ), + dfschema, + predicate: None, + }) + } + + pub fn add_predicate(&mut self, expr: &str) -> Result<(), AddPredicateError> { + let parsed_expr = self + .dfsession + .parse_sql_expr(expr, &self.dfschema) + .map_err(AddPredicateError::Parse)?; + + let mut coercion_rewriter = + datafusion::optimizer::analyzer::type_coercion::TypeCoercionRewriter::new( + &self.dfschema, + ); + let logical_expr = parsed_expr + .rewrite(&mut coercion_rewriter) + .map(|t| t.data) + .map_err(AddPredicateError::TypeCoercion)?; + + let output_type = logical_expr + .get_type(&self.dfschema) + 
.map_err(AddPredicateError::OutputType)?; + if output_type != DataType::Boolean { + return Err(AddPredicateError::NotAPredicate(output_type)); + } else if tiledb_expr::logical_expr::has_aggregate_functions(&logical_expr) { + return Err(AddPredicateError::ContainsAggregateFunctions); + } + let physical_expr = datafusion::physical_expr::create_physical_expr( + &logical_expr, + &self.dfschema, + &ExecutionProps::new(), + ) + .map_err(AddPredicateError::Compile)?; + + self.predicate = Some(datafusion::physical_expr::conjunction( + self.predicate + .take() + .into_iter() + .chain(std::iter::once(physical_expr)), + )); + Ok(()) + } + + pub fn evaluate(&self, tile: &ResultTile) -> Result { + let rb = unsafe { + // SAFETY: "This function is safe to call as long as the returned + // RecordBatch is not used after the ResultTile is destructed." + // The RecordBatch only lives in this stack frame, so we will follow this contract. + tiledb_arrow::record_batch::to_record_batch(self.dfschema.inner(), tile)? + }; + if let Some(p) = self.predicate.as_ref() { + Ok(p.evaluate(&rb).map_err(EvaluatePredicateError::Evaluate)?) 
+ } else { + Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some(true)))) + } + } + + fn evaluate_into_bitmap( + &self, + tile: &ResultTile, + bitmap: &mut [u8], + ) -> Result<(), EvaluatePredicateError> { + // TODO: consider not evaluating on cells where the bitmap is already set + + let result = self.evaluate(tile)?; + match result { + ColumnarValue::Scalar(s) => match s { + ScalarValue::Boolean(Some(true)) => { + // all cells pass predicates, no need to update bitmap + Ok(()) + } + ScalarValue::Boolean(Some(false)) => { + // no cells pass predicates, clear bitmap + bitmap.fill(0); + Ok(()) + } + ScalarValue::Boolean(None) => { + // no cells pass predicates, clear bitmap + bitmap.fill(0); + Ok(()) + } + _ => { + // should not be reachable due to return type check + unreachable!() + } + }, + ColumnarValue::Array(a) => { + if *a.data_type() == DataType::Boolean { + let bools = arrow::array::as_boolean_array(&a); + for (i, b) in bools.iter().enumerate() { + if !matches!(b, Some(true)) { + bitmap[i] = 0; + } + } + Ok(()) + } else { + // should not be reachable due to return type check + unreachable!() + } + } + } + } +} + +fn new_query_predicates_ffi( + schema: &ArraySchema, +) -> Result, tiledb_arrow::schema::Error> { + Ok(Box::new(QueryPredicates::new(schema)?)) +} diff --git a/tiledb/oxidize/session/build.rs b/tiledb/oxidize/session/build.rs deleted file mode 100644 index aa06e631fed..00000000000 --- a/tiledb/oxidize/session/build.rs +++ /dev/null @@ -1,4 +0,0 @@ -fn main() { - let _bridge = cxx_build::bridge("src/lib.rs"); - println!("cargo:rerun-if-changed=src/lib.rs"); -} diff --git a/tiledb/oxidize/session/src/lib.rs b/tiledb/oxidize/session/src/lib.rs deleted file mode 100644 index d3d0a454584..00000000000 --- a/tiledb/oxidize/session/src/lib.rs +++ /dev/null @@ -1,115 +0,0 @@ -#[cxx::bridge] -mod ffi { - #[namespace = "tiledb::sm"] - extern "C++" { - include!("tiledb/sm/array_schema/array_schema.h"); - - type ArraySchema = 
tiledb_cxx_interface::sm::array_schema::ArraySchema; - } - - #[namespace = "tiledb::oxidize::datafusion::logical_expr"] - extern "Rust" { - type ExternLogicalExpr; - } - - #[namespace = "tiledb::oxidize::datafusion::session"] - extern "Rust" { - type Session; - - fn new_session() -> Box; - - #[cxx_name = "parse_expr"] - fn parse_expr_ffi( - &self, - expr: &str, - array_schema: &ArraySchema, - ) -> Result>; - } -} - -/// Wraps for `tiledb_expr::logical_expr::LogicalExpr`. -// This ideally would not be necessary but a weakness of cxx is that it does -// not recognize that the same Rust type in different crates (or even modules) -// can map to the same C++ type. -// -// See https://github.com/dtolnay/cxx/issues/1323 -// -// We can fortunately work around this as follows: -// 1) `#[repr(transparent)]` ensures that the wrapper type has the same -// underlying representation as the wrapped type -// 2) `rust::Box::into_raw` and `rust::Box::from_raw` on the C++ side which -// allow us to cast the wrapper type into the wrapped type. -#[repr(transparent)] -struct ExternLogicalExpr(pub LogicalExpr); - -fn new_session() -> Box { - Box::new(Session::new()) -} - -use datafusion::common::DFSchema; -use datafusion::common::tree_node::TreeNode; -use datafusion::execution::context::SessionContext; -use datafusion::execution::session_state::SessionStateBuilder; -use datafusion::logical_expr::Expr; -use tiledb_cxx_interface::sm::array_schema::ArraySchema; -use tiledb_expr::LogicalExpr; - -#[derive(Debug, thiserror::Error)] -pub enum ParseExprError { - #[error("Schema error: {0}")] - Schema(#[from] tiledb_arrow::schema::Error), - #[error("Parse error: {0}")] - Parse(#[source] datafusion::common::DataFusionError), - #[error("Type coercion error: {0}")] - TypeCoercion(#[source] datafusion::common::DataFusionError), -} - -/// Wraps a DataFusion [SessionContext] for passing across the FFI boundary. 
-pub struct Session(pub SessionContext); - -impl Session { - pub fn new() -> Self { - Self(SessionContext::from( - SessionStateBuilder::new_with_default_features().build(), - )) - } - - fn parse_expr_ffi( - &self, - expr: &str, - array_schema: &ArraySchema, - ) -> Result, ParseExprError> { - let e = self.parse_expr(expr, array_schema)?; - Ok(Box::new(ExternLogicalExpr(LogicalExpr(e)))) - } - - fn parse_expr(&self, expr: &str, array_schema: &ArraySchema) -> Result { - let (arrow_schema, _) = - tiledb_arrow::schema::to_arrow(array_schema, tiledb_arrow::schema::WhichSchema::View)?; - let df_schema = { - // SAFETY: this only errors if the names are not unique, - // which they will be because `ArraySchema` requires it - DFSchema::try_from(arrow_schema).unwrap() - }; - - let parsed = self - .0 - .parse_sql_expr(expr, &df_schema) - .map_err(ParseExprError::Parse)?; - - let mut coercion_rewriter = - datafusion::optimizer::analyzer::type_coercion::TypeCoercionRewriter::new(&df_schema); - //.map_err(ParseExprError::TypeCoercion)?; - - parsed - .rewrite(&mut coercion_rewriter) - .map(|t| t.data) - .map_err(ParseExprError::TypeCoercion) - } -} - -impl Default for Session { - fn default() -> Self { - Self::new() - } -} diff --git a/tiledb/oxidize/staticlibs/core-objects/Cargo.toml b/tiledb/oxidize/staticlibs/core-objects/Cargo.toml index 51bc95648bc..d52d22db809 100644 --- a/tiledb/oxidize/staticlibs/core-objects/Cargo.toml +++ b/tiledb/oxidize/staticlibs/core-objects/Cargo.toml @@ -7,7 +7,7 @@ version = { workspace = true } [dependencies] tiledb-arrow = { workspace = true } tiledb-expr = { workspace = true } -tiledb-session = { workspace = true } +tiledb-query-predicates = { workspace = true } [lib] name = "tiledb_core_objects_rs" diff --git a/tiledb/oxidize/staticlibs/core-objects/src/lib.rs b/tiledb/oxidize/staticlibs/core-objects/src/lib.rs index 0f4513456cb..9f4f772e65c 100644 --- a/tiledb/oxidize/staticlibs/core-objects/src/lib.rs +++ 
b/tiledb/oxidize/staticlibs/core-objects/src/lib.rs @@ -1,3 +1,3 @@ pub use tiledb_arrow; pub use tiledb_expr; -pub use tiledb_session; +pub use tiledb_query_predicates; diff --git a/tiledb/oxidize/expr/src/query_condition.rs b/tiledb/oxidize/test-support/query-condition/src/datafusion.rs similarity index 100% rename from tiledb/oxidize/expr/src/query_condition.rs rename to tiledb/oxidize/test-support/query-condition/src/datafusion.rs diff --git a/tiledb/oxidize/test-support/query-condition/src/lib.rs b/tiledb/oxidize/test-support/query-condition/src/lib.rs index b10505b4bb3..a66df036967 100644 --- a/tiledb/oxidize/test-support/query-condition/src/lib.rs +++ b/tiledb/oxidize/test-support/query-condition/src/lib.rs @@ -4,6 +4,7 @@ //! This enables property-based testing against arbitrary query conditions //! using the strategies we have already written in `tiledb_common`. +mod datafusion; mod enums; use tiledb_common::query::condition::*; diff --git a/tiledb/sm/config/config.cc b/tiledb/sm/config/config.cc index 167e7d62609..6681ee24881 100644 --- a/tiledb/sm/config/config.cc +++ b/tiledb/sm/config/config.cc @@ -123,7 +123,6 @@ const std::string Config::SM_QUERY_SPARSE_GLOBAL_ORDER_PREPROCESS_TILE_MERGE = "32768"; const std::string Config::SM_QUERY_SPARSE_UNORDERED_WITH_DUPS_READER = "refactored"; -const std::string Config::SM_QUERY_CONDITION_EVALUATOR = "ast"; const std::string Config::SM_MEM_MALLOC_TRIM = "true"; const std::string Config::SM_UPPER_MEMORY_LIMIT = "1073741824"; // 1GB const std::string Config::SM_MEM_TOTAL_BUDGET = "10737418240"; // 10GB @@ -325,8 +324,6 @@ const std::map default_config_values = { std::make_pair( "sm.query.sparse_unordered_with_dups.reader", Config::SM_QUERY_SPARSE_UNORDERED_WITH_DUPS_READER), - std::make_pair( - "sm.query.condition_evaluator", Config::SM_QUERY_CONDITION_EVALUATOR), std::make_pair("sm.mem.malloc_trim", Config::SM_MEM_MALLOC_TRIM), std::make_pair( "sm.mem.tile_upper_memory_limit", Config::SM_UPPER_MEMORY_LIMIT), diff 
--git a/tiledb/sm/config/config.h b/tiledb/sm/config/config.h index 4893839d673..fb87debbc68 100644 --- a/tiledb/sm/config/config.h +++ b/tiledb/sm/config/config.h @@ -253,9 +253,6 @@ class Config { /** Which reader to use for sparse unordered with dups queries. */ static const std::string SM_QUERY_SPARSE_UNORDERED_WITH_DUPS_READER; - /** How to evaluate query conditions */ - static const std::string SM_QUERY_CONDITION_EVALUATOR; - /** Should malloc_trim be called on query/ctx destructors. */ static const std::string SM_MEM_MALLOC_TRIM; diff --git a/tiledb/sm/query/query.cc b/tiledb/sm/query/query.cc index 20c59678e09..eeb11b78f20 100644 --- a/tiledb/sm/query/query.cc +++ b/tiledb/sm/query/query.cc @@ -60,9 +60,7 @@ #include "tiledb/sm/tile/writer_tile_tuple.h" #ifdef HAVE_RUST -#include "tiledb/oxidize/arrow.h" -#include "tiledb/oxidize/expr.h" -#include "tiledb/oxidize/session.h" +#include "tiledb/oxidize/query_predicates.h" #endif #include @@ -694,7 +692,7 @@ void Query::init() { // Create dimension label queries and remove labels from subarray. 
if (uses_dimension_labels()) { - if (condition_.has_value()) { + if (predicates_.condition_.has_value()) { throw QueryException( "Cannot init query; Using query conditions and dimension labels " "together is not supported."); @@ -731,31 +729,6 @@ void Query::init() { fragment_name_)); } -#ifdef HAVE_RUST - if (!predicates_.empty()) { - try { - // treat existing query condition (if any) as datafusion - if (condition_.has_value()) { - predicates_.push_back(condition_->as_datafusion( - array_schema(), - tiledb::oxidize::arrow::schema::WhichSchema::View)); - condition_.reset(); - } - - // join them together - rust::Slice> - preds(predicates_.data(), predicates_.size()); - auto conjunction = - tiledb::oxidize::datafusion::logical_expr::make_conjunction(preds); - condition_.emplace(array_schema(), std::move(conjunction)); - } catch (const rust::Error& e) { - throw QueryException( - "Error initializing predicates: " + std::string(e.what())); - } - } -#endif - // Create the query strategy if querying main array and the Subarray does // not need to be updated. 
if (!only_dim_label_query() && !subarray_.has_label_ranges()) { @@ -793,7 +766,7 @@ const std::optional& Query::condition() const { "queries"); } - return condition_; + return predicates_.condition_; } const std::vector& Query::update_values() const { @@ -847,8 +820,8 @@ Status Query::process() { } } - if (condition_.has_value()) { - auto& names = condition_->enumeration_field_names(); + if (predicates_.condition_.has_value()) { + auto& names = predicates_.condition_->enumeration_field_names(); std::unordered_set deduped_enmr_names; for (auto name : names) { auto attr = array_schema_->attribute(name); @@ -872,30 +845,7 @@ Status Query::process() { return Status::Ok(); })); - condition_->rewrite_for_schema(array_schema()); - - // experimental feature - maybe evaluate using datafusion - const std::string evaluator_param_name = "sm.query.condition_evaluator"; - const auto evaluator = config_.get(evaluator_param_name); - if (evaluator == "datafusion") { -#ifdef HAVE_RUST - auto timer_se = - stats_->start_timer("query_condition_rewrite_to_datafusion"); - condition_->rewrite_to_datafusion( - array_schema(), tiledb::oxidize::arrow::schema::WhichSchema::Storage); -#else - std::stringstream ss; - ss << "Invalid value for parameter '" << evaluator_param_name - << "': 'datafusion' requires build configuration '-DTILEDB_RUST=ON'"; - throw QueryException(ss.str()); -#endif - } else if (evaluator.has_value() && evaluator != "ast") { - std::stringstream ss; - ss << "Invalid value for parameter '" << evaluator_param_name - << "': found '" << evaluator.value() - << "', expected 'datafusion' or 'ast'"; - throw QueryException(ss.str()); - } + predicates_.condition_->rewrite_for_schema(array_schema()); } if (type_ == QueryType::READ) { @@ -1522,7 +1472,7 @@ Status Query::set_condition(const QueryCondition& condition) { throw std::invalid_argument("Query conditions must not be empty"); } - condition_ = condition; + predicates_.condition_ = condition; return Status::Ok(); } @@ -1539,43 
+1489,27 @@ Status Query::add_predicate([[maybe_unused]] const char* predicate) { "initialized query is not supported."); } -#ifdef HAVE_RUST - try { - if (!session_.has_value()) { - session_.emplace(tiledb::oxidize::datafusion::session::new_session()); - } - - auto box_extern_expr = (*session_)->parse_expr(predicate, array_schema()); - auto extern_expr = box_extern_expr.into_raw(); - - // NB: Rust cxx does not have a way to have crate A construct and return - // an opaque Rust type which is defined in crate B. So above we create an - // "ExternLogicalExpr" whose representation is exactly that of - // LogicalExpr, and we can transmute the raw pointer after un-boxing it. - // This is all quite unsafe but that's life at the FFI boundary. For now, - // hopefully. - using LogicalExpr = tiledb::oxidize::datafusion::logical_expr::LogicalExpr; - auto expr = rust::Box::from_raw( - reinterpret_cast(extern_expr)); - - if (!expr->is_predicate(array_schema())) { - return Status_QueryError("Expression does not return a boolean value"); - } - if (expr->has_aggregate_functions()) { +#ifndef HAVE_RUST + return Status_QueryError( + "Cannot add query predicate: feature requires build " + "configuration '-DTILEDB_RUST=ON'"); +#else + if (!predicates_.datafusion_.has_value()) { + try { + predicates_.datafusion_.emplace( + tiledb::oxidize::new_query_predicates(array_schema())); + } catch (const rust::Error& e) { return Status_QueryError( - "Aggregate functions in predicates are not supported"); + "Cannot add predicate: Schema error: " + std::string(e.what())); } - predicates_.push_back(std::move(expr)); + } + try { + predicates_.datafusion_.value()->add_predicate(predicate); } catch (const rust::Error& e) { return Status_QueryError( "Error adding predicate: " + std::string(e.what())); } - return Status::Ok(); -#else - return Status_QueryError( - "Cannot add query predicate: feature requires build " - "configuration '-DTILEDB_RUST=ON'"); #endif } @@ -1913,7 +1847,7 @@ Status 
Query::create_strategy(bool skip_checks_serialization) { aggregate_buffers_, subarray_, layout, - condition_, + predicates_, default_channel_aggregates_, skip_checks_serialization); if (type_ == QueryType::WRITE || type_ == QueryType::MODIFY_EXCLUSIVE) { diff --git a/tiledb/sm/query/query.h b/tiledb/sm/query/query.h index dfc20b3babf..5ce5c9ee3c2 100644 --- a/tiledb/sm/query/query.h +++ b/tiledb/sm/query/query.h @@ -1045,18 +1045,8 @@ class Query { /** Stores information about the written fragments. */ std::vector written_fragment_info_; - /** The query condition. */ - std::optional condition_; - -#ifdef HAVE_RUST - /** Datafusion context for parsing and evaluating predicates */ - std::optional> - session_; - - /** Predicates */ - std::vector> - predicates_; -#endif + /** Query predicates. */ + QueryPredicates predicates_; /** The update values. */ std::vector update_values_; diff --git a/tiledb/sm/query/query_condition.cc b/tiledb/sm/query/query_condition.cc index 66064b2367a..2b5d6e43a7f 100644 --- a/tiledb/sm/query/query_condition.cc +++ b/tiledb/sm/query/query_condition.cc @@ -42,8 +42,7 @@ #include "tiledb/sm/query/readers/result_cell_slab.h" #ifdef HAVE_RUST -#include "tiledb/oxidize/arrow.h" -#include "tiledb/oxidize/expr.h" +#include "tiledb/oxidize/query_predicates.h" #endif #include @@ -105,22 +104,6 @@ QueryCondition::QueryCondition( , tree_(std::move(tree)) { } -#ifdef HAVE_RUST -QueryCondition::QueryCondition( - const ArraySchema& array_schema, - rust::Box&& expr) { - const auto columns = expr->columns(); - for (const auto& c : columns) { - field_names_.insert(std::string(c.data(), c.size())); - } - - datafusion_.emplace( - array_schema, - tiledb::oxidize::arrow::schema::WhichSchema::View, - std::move(expr)); -} -#endif - QueryCondition::QueryCondition(const QueryCondition& rhs) : condition_marker_(rhs.condition_marker_) , condition_index_(rhs.condition_index_) @@ -183,6 +166,7 @@ void QueryCondition::rewrite_for_schema(const ArraySchema& array_schema) 
{ tree_->rewrite_for_schema(array_schema); } +/* #ifdef HAVE_RUST rust::Box QueryCondition::as_datafusion( @@ -208,6 +192,7 @@ bool QueryCondition::rewrite_to_datafusion( return false; } #endif +*/ Status QueryCondition::check(const ArraySchema& array_schema) const { if (!tree_) { @@ -1335,14 +1320,6 @@ Status QueryCondition::apply( const std::vector>& fragment_metadata, std::vector& result_cell_slabs, const uint64_t stride) const { -#ifdef HAVE_RUST - if (!tree_ && datafusion_.has_value()) { - throw QueryConditionException( - "This query does not support predicates added with " - "tiledb_query_add_predicate"); - } -#endif - if (!tree_) { return Status::Ok(); } @@ -2171,13 +2148,6 @@ Status QueryCondition::apply_dense( return Status_QueryConditionError("The result buffer is null."); } -#ifdef HAVE_RUST - if (tree_ == nullptr && datafusion_.has_value()) { - return Status_QueryConditionError( - "tiledb_query_add_predicate is not supported for dense array queries"); - } -#endif - span result_span(result_buffer + start, length); apply_tree_dense( tree_, @@ -2952,26 +2922,8 @@ Status QueryCondition::apply_sparse( const QueryCondition::Params& params, const ResultTile& result_tile, std::span result_bitmap) { -#ifdef HAVE_RUST - if (datafusion_.has_value()) { - try { - datafusion_.value().apply(params, result_tile, result_bitmap); - } catch (const ::rust::Error& e) { - throw QueryConditionException( - "Error evaluating expression: " + std::string(e.what())); - } - } else { - apply_tree_sparse( - tree_, - params, - result_tile, - std::multiplies(), - result_bitmap); - } -#else apply_tree_sparse( tree_, params, result_tile, std::multiplies(), result_bitmap); -#endif return Status::Ok(); } @@ -2992,74 +2944,6 @@ uint64_t QueryCondition::condition_index() const { return condition_index_; } -#ifdef HAVE_RUST -QueryCondition::Datafusion::Datafusion( - const ArraySchema& array_schema, - tiledb::oxidize::arrow::schema::WhichSchema which, - rust::Box&& expr) - : 
schema_(tiledb::oxidize::arrow::schema::project( - array_schema, which, expr->columns())) - , expr_(tiledb::oxidize::datafusion::physical_expr::create( - *schema_, std::move(expr))) { -} - -template -void QueryCondition::Datafusion::apply( - const QueryCondition::Params&, - const ResultTile& result_tile, - std::span result_bitmap) const { - const auto arrow = - tiledb::oxidize::arrow::record_batch::create(*schema_, result_tile); - const auto predicate_eval = expr_->evaluate(*arrow); - static_assert( - std::is_same_v || - std::is_same_v); - if constexpr (std::is_same_v) { - const auto predicate_out_u8 = predicate_eval->cast_to(Datatype::UINT8); - const auto bitmap = predicate_out_u8->values_u8(); - if (predicate_out_u8->is_scalar() && bitmap.empty()) { - // all NULLs - for (auto& result : result_bitmap) { - result = 0; - } - } else if (predicate_out_u8->is_scalar()) { - // all the same value - for (auto& result : result_bitmap) { - result = result * bitmap[0]; - } - } else if (bitmap.size() == result_bitmap.size()) { - for (uint64_t i = 0; i < bitmap.size(); i++) { - result_bitmap[i] *= bitmap[i]; - } - } else { - throw QueryConditionException( - "Expression evaluation bitmap has unexpected size"); - } - } else { - const auto predicate_out_u64 = predicate_eval->cast_to(Datatype::UINT64); - const auto bitmap = predicate_out_u64->values_u64(); - if (predicate_out_u64->is_scalar() && bitmap.empty()) { - // all NULLs - for (auto& result : result_bitmap) { - result = 0; - } - } else if (predicate_out_u64->is_scalar()) { - // all the same value - for (auto& result : result_bitmap) { - result = result * bitmap[0]; - } - } else if (bitmap.size() == result_bitmap.size()) { - for (uint64_t i = 0; i < result_bitmap.size(); i++) { - result_bitmap[i] *= bitmap[i]; - } - } else { - throw QueryConditionException( - "Expression evaluation bitmap has unexpected size"); - } - } -} -#endif - // Explicit template instantiations. 
template Status QueryCondition::apply_sparse( const QueryCondition::Params&, const ResultTile&, std::span); diff --git a/tiledb/sm/query/query_condition.h b/tiledb/sm/query/query_condition.h index 6e210cdbc54..81af63c03c9 100644 --- a/tiledb/sm/query/query_condition.h +++ b/tiledb/sm/query/query_condition.h @@ -42,20 +42,14 @@ #include "tiledb/sm/query/ast/query_ast.h" #ifdef HAVE_RUST +#include "tiledb/oxidize/arrow.h" #include "tiledb/oxidize/rust.h" #endif using namespace tiledb::common; -namespace tiledb::oxidize::arrow::schema { -struct ArrowArraySchema; -enum class WhichSchema : uint8_t; -} // namespace tiledb::oxidize::arrow::schema -namespace tiledb::oxidize::datafusion::logical_expr { -struct LogicalExpr; -} -namespace tiledb::oxidize::datafusion::physical_expr { -struct PhysicalExpr; +namespace tiledb::oxidize { +struct QueryPredicates; } namespace tiledb { @@ -154,13 +148,6 @@ class QueryCondition { const std::string& condition_marker, tdb_unique_ptr&& tree); -#ifdef HAVE_RUST - /** Constructor from a datafusion expression tree */ - QueryCondition( - const ArraySchema& array_schema, - rust::Box&& expr); -#endif - /** Copy constructor. */ QueryCondition(const QueryCondition& rhs); @@ -208,34 +195,6 @@ class QueryCondition { */ void rewrite_for_schema(const ArraySchema& array_schema); -#ifdef HAVE_RUST - /** - * If desired and possible, rewrite the query condition to use Datafusion to - * evaluate. - * - * This is principally used for testing, but may also be called from - * production if a query has both a query condition and a datafusion predicate - * added. 
- * - * @param array_schema - * @param which The manner of interpreting the array_schema into Arrow - * - * @return true if a rewrite occurred, false otherwise - */ - bool rewrite_to_datafusion( - const ArraySchema& array_schema, - tiledb::oxidize::arrow::schema::WhichSchema which); - - /** - * @return an equivalent representation of this condition's expression tree as - * a Datafusion logical expression - */ - rust::Box - as_datafusion( - const ArraySchema& array_schema, - tiledb::oxidize::arrow::schema::WhichSchema which); -#endif - /** * Verifies that the current state contains supported comparison * operations. Currently, we support the following: @@ -426,36 +385,6 @@ class QueryCondition { /** AST Tree structure representing the condition. **/ tdb_unique_ptr tree_{}; -#ifdef HAVE_RUST - /** Datafusion expression evaluation */ - struct Datafusion { - using BoxSchema = - ::rust::Box; - using BoxExpr = - ::rust::Box; - BoxSchema schema_; - BoxExpr expr_; - - Datafusion(BoxSchema&& schema, BoxExpr&& expr) - : schema_(std::move(schema)) - , expr_(std::move(expr)) { - } - - Datafusion( - const ArraySchema& array_schema, - tiledb::oxidize::arrow::schema::WhichSchema which, - rust::Box&& - expr); - - template - void apply( - const QueryCondition::Params& params, - const ResultTile& result_tile, - std::span result_bitmap) const; - }; - std::optional datafusion_; -#endif - /** Caches all field names in the value nodes of the AST. */ mutable std::unordered_set field_names_; @@ -781,6 +710,25 @@ class QueryCondition { std::span result_bitmap) const; }; +/** + * + */ +struct QueryPredicates { + std::optional condition_; + +#ifdef HAVE_RUST + /** + * Query predicates. + * + * History lesson: + * QueryCondition was added first and provides a C API to construct expression + * trees. QueryPredicates was added later and uses DataFusion to parse text + * predicates and provide much broader evaluation capabilities. 
+ */ + std::optional> datafusion_; +#endif +}; + } // namespace sm } // namespace tiledb diff --git a/tiledb/sm/query/strategy_base.h b/tiledb/sm/query/strategy_base.h index 9265af7d1ee..ff845d01753 100644 --- a/tiledb/sm/query/strategy_base.h +++ b/tiledb/sm/query/strategy_base.h @@ -37,6 +37,7 @@ #include "tiledb/common/status.h" #include "tiledb/sm/array_schema/dimension.h" #include "tiledb/sm/misc/types.h" +#include "tiledb/sm/query/query_condition.h" #include "tiledb/sm/storage_manager/cancellation_source.h" #include "tiledb/sm/storage_manager/context_resources.h" @@ -50,7 +51,6 @@ class LocalQueryStateMachine; class MemoryTracker; class Subarray; class QueryBuffer; -class QueryCondition; using DefaultChannelAggregates = std::unordered_map>; @@ -78,7 +78,7 @@ class StrategyParams { std::unordered_map& aggregate_buffers, Subarray& subarray, Layout layout, - std::optional& condition, + QueryPredicates& predicates, DefaultChannelAggregates& default_channel_aggregates, bool skip_checks_serialization) : resources_(resources) @@ -93,7 +93,7 @@ class StrategyParams { , aggregate_buffers_(aggregate_buffers) , subarray_(subarray) , layout_(layout) - , condition_(condition) + , predicates_(predicates) , default_channel_aggregates_(default_channel_aggregates) , skip_checks_serialization_(skip_checks_serialization) { } @@ -163,7 +163,7 @@ class StrategyParams { /** Return the condition. */ inline std::optional& condition() { - return condition_; + return predicates_.condition_; } /** Return the default channel aggregates. */ @@ -220,8 +220,8 @@ class StrategyParams { /** Layout of the cells in the result of the subarray. */ Layout layout_; - /** Query condition. */ - std::optional& condition_; + /** Query predicates. */ + QueryPredicates& predicates_; /** Default channel aggregates. 
*/ DefaultChannelAggregates& default_channel_aggregates_; From 5d2521edf8ded350551bd382e474da761df83a25 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Fri, 14 Nov 2025 15:31:46 -0500 Subject: [PATCH 49/52] Single QueryPredicates FFI boundary passes existing tests --- test/src/unit-Reader.cc | 4 +- test/src/unit-query-add-predicate.cc | 19 +++---- tiledb/oxidize/query-predicates/src/lib.rs | 54 +++++++++++++++---- tiledb/sm/query/legacy/reader.cc | 23 +++++--- tiledb/sm/query/query_condition.cc | 22 ++++++++ tiledb/sm/query/query_condition.h | 17 +++++- tiledb/sm/query/readers/dense_reader.cc | 30 +++++++---- .../query/readers/ordered_dim_label_reader.cc | 2 +- tiledb/sm/query/readers/reader_base.cc | 2 +- tiledb/sm/query/readers/reader_base.h | 4 +- .../readers/sparse_global_order_reader.cc | 4 +- .../query/readers/sparse_index_reader_base.cc | 41 +++++++++++--- .../sparse_unordered_with_dups_reader.cc | 4 +- tiledb/sm/query/strategy_base.h | 4 ++ 14 files changed, 173 insertions(+), 57 deletions(-) diff --git a/test/src/unit-Reader.cc b/test/src/unit-Reader.cc index e942baa1189..b0e8f93c8af 100644 --- a/test/src/unit-Reader.cc +++ b/test/src/unit-Reader.cc @@ -163,7 +163,7 @@ TEST_CASE_METHOD( buffers.emplace( "a", tiledb::sm::QueryBuffer(nullptr, nullptr, &tmp_size, &tmp_size)); std::unordered_map aggregate_buffers; - std::optional condition; + QueryPredicates predicates; ThreadPool tp_cpu(4), tp_io(4); Array array(context.resources(), URI(array_name_)); CHECK(array.open(QueryType::READ, EncryptionType::NO_ENCRYPTION, nullptr, 0) @@ -183,7 +183,7 @@ TEST_CASE_METHOD( aggregate_buffers, subarray, Layout::ROW_MAJOR, - condition, + predicates, default_channel_aggregates, false); Reader reader(&g_helper_stats, g_helper_logger(), params); diff --git a/test/src/unit-query-add-predicate.cc b/test/src/unit-query-add-predicate.cc index 3fa1f178d7f..917345da81a 100644 --- a/test/src/unit-query-add-predicate.cc +++ b/test/src/unit-query-add-predicate.cc @@ -226,11 +226,9 
@@ const Cells expect_a_is_null_and_v_starts_with_t = make_cells( auto matchEnumerationNotSupported(std::string enumeration_name = "e") { return Catch::Matchers::ContainsSubstring( - "QueryCondition: Error evaluating expression: Cannot process field " - "'" + + "Error evaluating expression: Data error: Cannot process field '" + enumeration_name + - "': Attributes with enumerations are not supported in text " - "predicates"); + "': Attributes with enumerations are not supported in text predicates"); } void QueryAddPredicateFx::create_array( @@ -448,7 +446,7 @@ TEST_CASE_METHOD( REQUIRE_THROWS_WITH( QueryExperimental::add_predicate(ctx_, query, {"row"}), Catch::Matchers::ContainsSubstring( - "Expression does not return a boolean value")); + "Expression is not a predicate: found return type UInt64")); } SECTION("Schema error") { @@ -479,7 +477,8 @@ TEST_CASE_METHOD( REQUIRE_THROWS_WITH( QueryExperimental::add_predicate(ctx_, query, {"sum(row) >= 10"}), Catch::Matchers::ContainsSubstring( - "Aggregate functions in predicates are not supported")); + "Expression contains aggregate functions which are not supported " + "in predicates")); } } } @@ -563,8 +562,7 @@ TEST_CASE_METHOD( write_array(array_name); const auto match = Catch::Matchers::ContainsSubstring( - "This query does not support predicates added with " - "tiledb_query_add_predicate"); + "tiledb_query_add_predicate is not supported for this query"); SECTION("Row major") { REQUIRE_THROWS_WITH( @@ -836,10 +834,7 @@ TEST_CASE_METHOD( // enumeration not supported yet REQUIRE_THROWS_WITH( query_array(array_name, query_order, {"e < 'california'"}), - Catch::Matchers::ContainsSubstring( - "QueryCondition: Error evaluating expression: Cannot process field " - "'e': Attributes with enumerations are not supported in text " - "predicates")); + matchEnumerationNotSupported()); } } diff --git a/tiledb/oxidize/query-predicates/src/lib.rs b/tiledb/oxidize/query-predicates/src/lib.rs index 9c36ddcb467..675cf9498e6 100644 --- 
a/tiledb/oxidize/query-predicates/src/lib.rs +++ b/tiledb/oxidize/query-predicates/src/lib.rs @@ -16,9 +16,12 @@ mod ffi { #[cxx_name = "new_query_predicates"] fn new_query_predicates_ffi(schema: &ArraySchema) -> Result>; + fn field_names(&self) -> Vec; + fn add_predicate(&mut self, expr: &str) -> Result<()>; - fn evaluate_into_bitmap(&self, tile: &ResultTile, bitmap: &mut [u8]) -> Result<()>; + fn evaluate_into_bitmap_u8(&self, tile: &ResultTile, bitmap: &mut [u8]) -> Result<()>; + fn evaluate_into_bitmap_u64(&self, tile: &ResultTile, bitmap: &mut [u64]) -> Result<()>; } } @@ -30,8 +33,10 @@ use datafusion::common::{DFSchema, ScalarValue}; use datafusion::execution::context::ExecutionProps; use datafusion::execution::context::SessionContext; use datafusion::execution::session_state::SessionStateBuilder; -use datafusion::logical_expr::ExprSchemable; +use datafusion::logical_expr::{Expr, ExprSchemable}; use datafusion::physical_plan::{ColumnarValue, PhysicalExpr}; +use itertools::Itertools; +use num_traits::Zero; use tiledb_cxx_interface::sm::array_schema::ArraySchema; use tiledb_cxx_interface::sm::query::readers::ResultTile; @@ -59,7 +64,7 @@ pub enum AddPredicateError { #[derive(Debug, thiserror::Error)] pub enum EvaluatePredicateError { - #[error("Result tile error: {0}")] + #[error("Data error: {0}")] ResultTile(#[from] tiledb_arrow::record_batch::Error), #[error("Evaluation error: {0}")] Evaluate(#[source] datafusion::common::DataFusionError), @@ -68,6 +73,7 @@ pub enum EvaluatePredicateError { pub struct QueryPredicates { dfsession: SessionContext, dfschema: DFSchema, + logical_exprs: Vec, predicate: Option>, } @@ -86,10 +92,20 @@ impl QueryPredicates { SessionStateBuilder::new_with_default_features().build(), ), dfschema, + logical_exprs: vec![], predicate: None, }) } + /// Returns a list of all of the field names used in all of the predicates + pub fn field_names(&self) -> Vec { + self.logical_exprs + .iter() + .flat_map(tiledb_expr::logical_expr::columns) + 
.unique() + .collect() + } + pub fn add_predicate(&mut self, expr: &str) -> Result<(), AddPredicateError> { let parsed_expr = self .dfsession @@ -120,6 +136,7 @@ impl QueryPredicates { ) .map_err(AddPredicateError::Compile)?; + self.logical_exprs.push(logical_expr); self.predicate = Some(datafusion::physical_expr::conjunction( self.predicate .take() @@ -143,11 +160,14 @@ impl QueryPredicates { } } - fn evaluate_into_bitmap( + fn evaluate_into_bitmap( &self, tile: &ResultTile, - bitmap: &mut [u8], - ) -> Result<(), EvaluatePredicateError> { + bitmap: &mut [T], + ) -> Result<(), EvaluatePredicateError> + where + T: Copy + Zero, + { // TODO: consider not evaluating on cells where the bitmap is already set let result = self.evaluate(tile)?; @@ -159,12 +179,12 @@ impl QueryPredicates { } ScalarValue::Boolean(Some(false)) => { // no cells pass predicates, clear bitmap - bitmap.fill(0); + bitmap.fill(T::zero()); Ok(()) } ScalarValue::Boolean(None) => { // no cells pass predicates, clear bitmap - bitmap.fill(0); + bitmap.fill(T::zero()); Ok(()) } _ => { @@ -177,7 +197,7 @@ impl QueryPredicates { let bools = arrow::array::as_boolean_array(&a); for (i, b) in bools.iter().enumerate() { if !matches!(b, Some(true)) { - bitmap[i] = 0; + bitmap[i] = T::zero(); } } Ok(()) @@ -188,6 +208,22 @@ impl QueryPredicates { } } } + + fn evaluate_into_bitmap_u8( + &self, + tile: &ResultTile, + bitmap: &mut [u8], + ) -> Result<(), EvaluatePredicateError> { + self.evaluate_into_bitmap::(tile, bitmap) + } + + fn evaluate_into_bitmap_u64( + &self, + tile: &ResultTile, + bitmap: &mut [u64], + ) -> Result<(), EvaluatePredicateError> { + self.evaluate_into_bitmap::(tile, bitmap) + } } fn new_query_predicates_ffi( diff --git a/tiledb/sm/query/legacy/reader.cc b/tiledb/sm/query/legacy/reader.cc index 36dbfb111a1..cd624b37422 100644 --- a/tiledb/sm/query/legacy/reader.cc +++ b/tiledb/sm/query/legacy/reader.cc @@ -246,9 +246,15 @@ Status Reader::dowork() { auto timer_se = 
stats_->start_timer("dowork"); // Check that the query condition is valid. - if (condition_.has_value()) { - RETURN_NOT_OK(condition_->check(array_schema_)); + if (predicates_.condition_.has_value()) { + RETURN_NOT_OK(predicates_.condition_->check(array_schema_)); } +#ifdef HAVE_RUST + if (predicates_.datafusion_.has_value()) { + return logger_->status(Status_ReaderError( + "tiledb_query_add_predicate is not supported for this query")); + } +#endif if (buffers_.count(constants::delete_timestamps) != 0) { return logger_->status( @@ -357,8 +363,8 @@ Status Reader::load_initial_data() { RETURN_CANCEL_OR_ERROR(generate_timestamped_conditions()); // Make a list of dim/attr that will be loaded for query condition. - if (condition_.has_value()) { - qc_loaded_attr_names_set_.merge(condition_->field_names()); + if (predicates_.condition_.has_value()) { + qc_loaded_attr_names_set_.merge(predicates_.condition_->field_names()); } for (auto delete_and_update_condition : delete_and_update_conditions_) { qc_loaded_attr_names_set_.merge(delete_and_update_condition.field_names()); @@ -382,7 +388,8 @@ Status Reader::apply_query_condition( std::vector& result_tiles, Subarray& subarray, uint64_t stride) { - if ((!condition_.has_value() && delete_and_update_conditions_.empty()) || + if ((!predicates_.has_predicates() && + delete_and_update_conditions_.empty()) || result_cell_slabs.empty()) { return Status::Ok(); } @@ -407,8 +414,8 @@ Status Reader::apply_query_condition( stride = 1; QueryCondition::Params params(query_memory_tracker_, array_schema_); - if (condition_.has_value()) { - RETURN_NOT_OK(condition_->apply( + if (predicates_.condition_.has_value()) { + RETURN_NOT_OK(predicates_.condition_->apply( params, fragment_metadata_, result_cell_slabs, stride)); } @@ -2243,7 +2250,7 @@ tuple> Reader::fill_dense_coords( // Query conditions mutate the result cell slabs to filter attributes. 
// This path does not use result cell slabs, which will fill coordinates // for cells that should be filtered out. - if (condition_.has_value()) { + if (predicates_.has_predicates()) { return { logger_->status(Status_ReaderError( "Cannot read dense coordinates; dense coordinate " diff --git a/tiledb/sm/query/query_condition.cc b/tiledb/sm/query/query_condition.cc index 2b5d6e43a7f..e0a8e6b97bb 100644 --- a/tiledb/sm/query/query_condition.cc +++ b/tiledb/sm/query/query_condition.cc @@ -2944,6 +2944,28 @@ uint64_t QueryCondition::condition_index() const { return condition_index_; } +std::unordered_set QueryPredicates::field_names() const { +#ifndef HAVE_RUST + if (condition_.has_value()) { + return condition_.value().field_names(); + } else { + return {}; + } +#else + std::unordered_set ret; + if (condition_.has_value()) { + ret = condition_.value().field_names(); + } + if (datafusion_.has_value()) { + const auto dffields = datafusion_.value()->field_names(); + for (const auto& rstring : dffields) { + ret.insert(std::string(rstring.begin(), rstring.end())); + } + } + return ret; +#endif +} + // Explicit template instantiations. 
template Status QueryCondition::apply_sparse( const QueryCondition::Params&, const ResultTile&, std::span); diff --git a/tiledb/sm/query/query_condition.h b/tiledb/sm/query/query_condition.h index 81af63c03c9..da8a5fec21b 100644 --- a/tiledb/sm/query/query_condition.h +++ b/tiledb/sm/query/query_condition.h @@ -42,7 +42,6 @@ #include "tiledb/sm/query/ast/query_ast.h" #ifdef HAVE_RUST -#include "tiledb/oxidize/arrow.h" #include "tiledb/oxidize/rust.h" #endif @@ -727,6 +726,22 @@ struct QueryPredicates { */ std::optional> datafusion_; #endif + + /** + * @return true if there are any predicates to apply + */ + bool has_predicates() const { +#ifndef HAVE_RUST + return condition_.has_value(); +#else + return condition_.has_value() || datafusion_.has_value(); +#endif + } + + /** + * @return a set of all unique field names used in the predicates + */ + std::unordered_set field_names() const; }; } // namespace sm diff --git a/tiledb/sm/query/readers/dense_reader.cc b/tiledb/sm/query/readers/dense_reader.cc index 1c3023b6edb..1d40c16df7e 100644 --- a/tiledb/sm/query/readers/dense_reader.cc +++ b/tiledb/sm/query/readers/dense_reader.cc @@ -152,9 +152,15 @@ Status DenseReader::dowork() { auto timer_se = stats_->start_timer("dowork"); // Check that the query condition is valid. - if (condition_.has_value()) { - RETURN_NOT_OK(condition_->check(array_schema_)); + if (predicates_.condition_.has_value()) { + RETURN_NOT_OK(predicates_.condition_->check(array_schema_)); } +#ifdef HAVE_RUST + if (predicates_.datafusion_.has_value()) { + throw DenseReaderException( + "tiledb_query_add_predicate is not supported for dense array queries"); + } +#endif get_dim_attr_stats(); @@ -308,8 +314,8 @@ Status DenseReader::dense_read() { } // Compute attribute names to load and copy. 
- if (condition_.has_value()) { - qc_loaded_attr_names_set_ = condition_->field_names(); + if (predicates_.has_predicates()) { + qc_loaded_attr_names_set_ = predicates_.field_names(); } qc_loaded_attr_names_.clear(); qc_loaded_attr_names_.reserve(qc_loaded_attr_names_set_.size()); @@ -352,7 +358,7 @@ Status DenseReader::dense_read() { uint64_t subarray_start_cell = 0; uint64_t subarray_end_cell = 0; std::vector qc_result( - !condition_.has_value() ? 0 : subarray.cell_num(), 1); + !predicates_.has_predicates() ? 0 : subarray.cell_num(), 1); // Keep track of the current var buffer sizes. std::map var_buffer_sizes; @@ -629,7 +635,7 @@ void DenseReader::init_read_state() { qc_coords_mode_ = config_.get("sm.query.dense.qc_coords_mode", Config::must_find); - if (qc_coords_mode_ && !condition_.has_value()) { + if (qc_coords_mode_ && !predicates_.condition_.has_value()) { throw DenseReaderException( "sm.query.dense.qc_coords_mode requires a query condition"); } @@ -1035,7 +1041,7 @@ Status DenseReader::apply_query_condition( auto timer_se = stats_->start_timer("apply_query_condition"); auto& result_space_tiles = iteration_tile_data->result_space_tiles(); - if (condition_.has_value()) { + if (predicates_.has_predicates()) { // Compute the result of the query condition. std::vector qc_names; qc_names.reserve(condition_names.size()); @@ -1147,7 +1153,7 @@ Status DenseReader::apply_query_condition( *(fragment_metadata_[frag_domains[i].fid()] ->array_schema() .get())); - throw_if_not_ok(condition_->apply_dense( + throw_if_not_ok(predicates_.condition_->apply_dense( params, result_space_tile.result_tile(frag_domains[i].fid()), start, @@ -1723,7 +1729,8 @@ Status DenseReader::copy_fixed_tiles( } // Apply query condition results to this slab. 
- if (condition_.has_value() && result_space_tile.qc_filtered_results()) { + if (predicates_.has_predicates() && + result_space_tile.qc_filtered_results()) { for (uint64_t c = 0; c < iter.cell_slab_length(); c++) { if (!(qc_result[c + cell_offset] & 0x1)) { memcpy( @@ -1901,7 +1908,8 @@ Status DenseReader::copy_offset_tiles( } } - if (condition_.has_value() && result_space_tile.qc_filtered_results()) { + if (predicates_.has_predicates() && + result_space_tile.qc_filtered_results()) { // Apply query condition results to this slab. for (uint64_t c = 0; c < iter.cell_slab_length(); c++) { if (!(qc_result[c + cell_offset] & 0x1)) { @@ -2076,7 +2084,7 @@ Status DenseReader::aggregate_tiles( } std::vector aggregate_bitmap(iter.cell_slab_length(), 1); - if (condition_.has_value()) { + if (predicates_.has_predicates()) { memcpy( aggregate_bitmap.data(), qc_result.data() + cell_offset, diff --git a/tiledb/sm/query/readers/ordered_dim_label_reader.cc b/tiledb/sm/query/readers/ordered_dim_label_reader.cc index f35a424e8d8..4b791572a4a 100644 --- a/tiledb/sm/query/readers/ordered_dim_label_reader.cc +++ b/tiledb/sm/query/readers/ordered_dim_label_reader.cc @@ -120,7 +120,7 @@ OrderedDimLabelReader::OrderedDimLabelReader( "Cannot initialize ordered dim label reader; Subarray is set"); } - if (condition_.has_value()) { + if (predicates_.has_predicates()) { throw OrderedDimLabelReaderException( "Ordered dimension label reader cannot process query condition"); } diff --git a/tiledb/sm/query/readers/reader_base.cc b/tiledb/sm/query/readers/reader_base.cc index 7cdb1230d5e..2d8f6e0b68e 100644 --- a/tiledb/sm/query/readers/reader_base.cc +++ b/tiledb/sm/query/readers/reader_base.cc @@ -76,7 +76,7 @@ ReaderBase::ReaderBase( stats::Stats* stats, shared_ptr logger, StrategyParams& params) : StrategyBase(stats, logger, params) , memory_tracker_(params.query_memory_tracker()) - , condition_(params.condition()) + , predicates_(params.predicates()) , user_requested_timestamps_(false) , 
deletes_consolidation_no_purge_( buffers_.count(constants::delete_timestamps) != 0) diff --git a/tiledb/sm/query/readers/reader_base.h b/tiledb/sm/query/readers/reader_base.h index 0242dc4c11b..b5d70e95717 100644 --- a/tiledb/sm/query/readers/reader_base.h +++ b/tiledb/sm/query/readers/reader_base.h @@ -245,8 +245,8 @@ class ReaderBase : public StrategyBase { /** The query's memory tracker. */ shared_ptr memory_tracker_; - /** The query condition. */ - std::optional& condition_; + /** User predicates */ + QueryPredicates& predicates_; /** * The delete and update conditions. diff --git a/tiledb/sm/query/readers/sparse_global_order_reader.cc b/tiledb/sm/query/readers/sparse_global_order_reader.cc index 521102423e0..0997ca14ffc 100644 --- a/tiledb/sm/query/readers/sparse_global_order_reader.cc +++ b/tiledb/sm/query/readers/sparse_global_order_reader.cc @@ -262,8 +262,8 @@ Status SparseGlobalOrderReader::dowork() { stats_->add_counter("loop_num", 1); // Check that the query condition is valid. 
- if (condition_.has_value()) { - throw_if_not_ok(condition_->check(array_schema_)); + if (predicates_.condition_.has_value()) { + throw_if_not_ok(predicates_.condition_->check(array_schema_)); } get_dim_attr_stats(); diff --git a/tiledb/sm/query/readers/sparse_index_reader_base.cc b/tiledb/sm/query/readers/sparse_index_reader_base.cc index 6a626b3503b..e12dff5c856 100644 --- a/tiledb/sm/query/readers/sparse_index_reader_base.cc +++ b/tiledb/sm/query/readers/sparse_index_reader_base.cc @@ -50,6 +50,10 @@ #include +#ifdef HAVE_RUST +#include "tiledb/oxidize/query_predicates.h" +#endif + namespace tiledb::sm { class SparseIndexReaderBaseException : public StatusException { @@ -141,7 +145,7 @@ uint64_t SparseIndexReaderBase::available_memory() { bool SparseIndexReaderBase::has_post_deduplication_conditions( FragmentMetadata& frag_meta) { - return frag_meta.has_delete_meta() || condition_.has_value() || + return frag_meta.has_delete_meta() || predicates_.has_predicates() || (!delete_and_update_conditions_.empty() && !deletes_consolidation_no_purge_); } @@ -248,8 +252,8 @@ Status SparseIndexReaderBase::load_initial_data() { } // Make a list of dim/attr that will be loaded for query condition. - if (condition_.has_value()) { - for (auto& name : condition_->field_names()) { + if (predicates_.has_predicates()) { + for (auto& name : predicates_.field_names()) { if (!array_schema_.is_dim(name) || !include_coords_) { qc_loaded_attr_names_set_.insert(name); } @@ -610,7 +614,7 @@ void SparseIndexReaderBase::apply_query_condition( std::vector& result_tiles) { auto timer_se = stats_->start_timer("apply_query_condition"); - if (condition_.has_value() || !delete_and_update_conditions_.empty() || + if (predicates_.has_predicates() || !delete_and_update_conditions_.empty() || use_timestamps_) { // Process all tiles in parallel. 
throw_if_not_ok(parallel_for( @@ -656,16 +660,41 @@ void SparseIndexReaderBase::apply_query_condition( } // Compute the result of the query condition for this tile. - if (condition_.has_value()) { + if (predicates_.condition_.has_value()) { QueryCondition::Params params( query_memory_tracker_, *(frag_meta->array_schema().get())); - throw_if_not_ok(condition_->apply_sparse( + throw_if_not_ok(predicates_.condition_->apply_sparse( params, *rt, rt->post_dedup_bitmap())); if (array_schema_.allows_dups()) { rt->count_cells(); } } +#ifdef HAVE_RUST + if (predicates_.datafusion_.has_value()) { + rust::Slice bitmap( + rt->post_dedup_bitmap().data(), rt->post_dedup_bitmap().size()); + static_assert( + std::is_same_v || + std::is_same_v); + try { + if constexpr (std::is_same_v) { + predicates_.datafusion_.value()->evaluate_into_bitmap_u8( + *rt, bitmap); + } else { + predicates_.datafusion_.value()->evaluate_into_bitmap_u64( + *rt, bitmap); + } + } catch (const rust::Error& e) { + throw SparseIndexReaderBaseException( + "Error evaluating expression: " + std::string(e.what())); + } + if (array_schema_.allows_dups()) { + rt->count_cells(); + } + } +#endif + // Apply delete conditions. if (!delete_and_update_conditions_.empty()) { // Allocate delete condition idx vector if required. This vector diff --git a/tiledb/sm/query/readers/sparse_unordered_with_dups_reader.cc b/tiledb/sm/query/readers/sparse_unordered_with_dups_reader.cc index 153ba11a2f7..41a6d6e9035 100644 --- a/tiledb/sm/query/readers/sparse_unordered_with_dups_reader.cc +++ b/tiledb/sm/query/readers/sparse_unordered_with_dups_reader.cc @@ -121,8 +121,8 @@ Status SparseUnorderedWithDupsReader::dowork() { } // Check that the query condition is valid. 
- if (condition_.has_value()) { - throw_if_not_ok(condition_->check(array_schema_)); + if (predicates_.condition_.has_value()) { + throw_if_not_ok(predicates_.condition_->check(array_schema_)); } get_dim_attr_stats(); diff --git a/tiledb/sm/query/strategy_base.h b/tiledb/sm/query/strategy_base.h index ff845d01753..a28a0d2cee2 100644 --- a/tiledb/sm/query/strategy_base.h +++ b/tiledb/sm/query/strategy_base.h @@ -166,6 +166,10 @@ class StrategyParams { return predicates_.condition_; } + inline QueryPredicates& predicates() { + return predicates_; + } + /** Return the default channel aggregates. */ inline DefaultChannelAggregates& default_channel_aggregates() { return default_channel_aggregates_; From 48195cf16119357a7feb1301c220790ab8f5fbbe Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Mon, 17 Nov 2025 16:09:14 -0500 Subject: [PATCH 50/52] unit_query_condition and API test both pass --- test/src/unit-query-add-predicate.cc | 17 +- tiledb/oxidize/CMakeLists.txt | 1 + tiledb/oxidize/Cargo.lock | 7 + tiledb/oxidize/expr/src/logical_expr.rs | 7 +- tiledb/oxidize/query-predicates/src/lib.rs | 215 ++++++++++++++---- .../unit-query-condition/Cargo.toml | 1 + .../unit-query-condition/src/lib.rs | 36 +++ .../test-support/query-condition/Cargo.toml | 6 + .../test-support/query-condition/src/lib.rs | 2 +- .../src/{datafusion.rs => logical_expr.rs} | 24 +- .../test-support/result-tile/src/lib.rs | 38 +++- tiledb/sm/query/query.cc | 6 + tiledb/sm/query/test/unit_query_condition.cc | 23 +- 13 files changed, 282 insertions(+), 101 deletions(-) rename tiledb/oxidize/test-support/query-condition/src/{datafusion.rs => logical_expr.rs} (97%) diff --git a/test/src/unit-query-add-predicate.cc b/test/src/unit-query-add-predicate.cc index 917345da81a..d45e48a82b2 100644 --- a/test/src/unit-query-add-predicate.cc +++ b/test/src/unit-query-add-predicate.cc @@ -908,6 +908,15 @@ TEST_CASE_METHOD( } } +/** + * Test combinations of query conditions and predicates. 
+ * + * While predicates are the more user-friendly option, query conditions are + * still around for historical reasons, and may also be more performant. + * Whatever the case, we don't explicitly disable combining these features, + * and so we should observe here that they mix well and the cells which come out + * of a query using both are the logical AND of both types of predicates. + */ TEST_CASE_METHOD( QueryAddPredicateFx, "Query add predicate with query condition", @@ -968,11 +977,9 @@ TEST_CASE_METHOD( const auto predresult = query_array(array_name, query_order, {"a IS NULL"}); CHECK(predresult == expect_a_is_null); - // NB: since we re-write the query condition into datafusion - // it also will not support this - REQUIRE_THROWS_WITH( - query_array(array_name, query_order, {"a IS NULL"}, kwargs), - matchEnumerationNotSupported()); + const auto andresult = + query_array(array_name, query_order, {"a IS NULL"}, kwargs); + CHECK(andresult == expect_a_and_e_are_null); } SECTION("Enumeration in predicate") { diff --git a/tiledb/oxidize/CMakeLists.txt b/tiledb/oxidize/CMakeLists.txt index 3266675d444..0d94656ac25 100644 --- a/tiledb/oxidize/CMakeLists.txt +++ b/tiledb/oxidize/CMakeLists.txt @@ -79,6 +79,7 @@ oxidize( EXPORT arrow cxx-interface + query-predicates test-support-cxx-interface ) diff --git a/tiledb/oxidize/Cargo.lock b/tiledb/oxidize/Cargo.lock index ce097d55352..ac0c25b3733 100644 --- a/tiledb/oxidize/Cargo.lock +++ b/tiledb/oxidize/Cargo.lock @@ -3029,8 +3029,14 @@ version = "0.1.0" dependencies = [ "anyhow", "cxx", + "datafusion", + "itertools", + "num-traits", + "thiserror 2.0.17", + "tiledb-arrow", "tiledb-common", "tiledb-cxx-interface", + "tiledb-datatype", "tiledb-test-support-cxx-interface", ] @@ -3078,6 +3084,7 @@ dependencies = [ "tiledb-cxx-interface", "tiledb-expr", "tiledb-pod", + "tiledb-query-predicates", "tiledb-test-array-schema", "tiledb-test-ffi", "tiledb-test-query-condition", diff --git a/tiledb/oxidize/expr/src/logical_expr.rs 
b/tiledb/oxidize/expr/src/logical_expr.rs index 5b10d953a74..ec9573e1398 100644 --- a/tiledb/oxidize/expr/src/logical_expr.rs +++ b/tiledb/oxidize/expr/src/logical_expr.rs @@ -5,11 +5,8 @@ use datafusion::common::tree_node::{TreeNode, TreeNodeRecursion, TreeNodeVisitor use datafusion::logical_expr::Expr; /// Returns a list of the names of the columns used in this expression. -pub fn columns(expr: &Expr) -> Vec { - expr.column_refs() - .into_iter() - .map(|c| c.name.clone()) - .collect() +pub fn columns(expr: &Expr) -> impl Iterator { + expr.column_refs().into_iter().map(|c| c.name.as_ref()) } /// Returns true if `expr` contains aggregate functions and false otherwise. diff --git a/tiledb/oxidize/query-predicates/src/lib.rs b/tiledb/oxidize/query-predicates/src/lib.rs index 675cf9498e6..41649369fa8 100644 --- a/tiledb/oxidize/query-predicates/src/lib.rs +++ b/tiledb/oxidize/query-predicates/src/lib.rs @@ -16,9 +16,12 @@ mod ffi { #[cxx_name = "new_query_predicates"] fn new_query_predicates_ffi(schema: &ArraySchema) -> Result>; - fn field_names(&self) -> Vec; + fn compile(&mut self) -> Result<()>; - fn add_predicate(&mut self, expr: &str) -> Result<()>; + unsafe fn field_names<'a>(&'a self) -> Vec<&'a str>; + + #[cxx_name = "add_predicate"] + fn add_text_predicate(&mut self, expr: &str) -> Result<()>; fn evaluate_into_bitmap_u8(&self, tile: &ResultTile, bitmap: &mut [u8]) -> Result<()>; fn evaluate_into_bitmap_u64(&self, tile: &ResultTile, bitmap: &mut [u64]) -> Result<()>; @@ -27,7 +30,7 @@ mod ffi { use std::sync::Arc; -use arrow::datatypes::DataType; +use arrow::datatypes::{DataType, Schema as ArrowSchema}; use datafusion::common::tree_node::TreeNode; use datafusion::common::{DFSchema, ScalarValue}; use datafusion::execution::context::ExecutionProps; @@ -37,6 +40,7 @@ use datafusion::logical_expr::{Expr, ExprSchemable}; use datafusion::physical_plan::{ColumnarValue, PhysicalExpr}; use itertools::Itertools; use num_traits::Zero; +use 
tiledb_arrow::schema::WhichSchema; use tiledb_cxx_interface::sm::array_schema::ArraySchema; use tiledb_cxx_interface::sm::query::readers::ResultTile; @@ -58,8 +62,12 @@ pub enum AddPredicateError { TypeCoercion(#[source] datafusion::common::DataFusionError), #[error("Output type error: {0}")] OutputType(#[source] datafusion::common::DataFusionError), +} + +#[derive(Debug, thiserror::Error)] +pub enum CompileError { #[error("Expression compile error: {0}")] - Compile(#[source] datafusion::common::DataFusionError), + PhysicalExpr(#[source] datafusion::common::DataFusionError), } #[derive(Debug, thiserror::Error)] @@ -70,35 +78,112 @@ pub enum EvaluatePredicateError { Evaluate(#[source] datafusion::common::DataFusionError), } -pub struct QueryPredicates { +/// Holds state to parse, analyze and evaluate predicates of a TileDB query. +pub enum QueryPredicates { + Build(Builder), + Evaluate(Evaluator), +} + +impl QueryPredicates { + pub fn new(schema: &ArraySchema) -> Result { + Ok(Self::Build(Builder::new(schema, WhichSchema::View)?)) + } + + pub fn add_text_predicate(&mut self, expr: &str) -> Result<(), AddPredicateError> { + match self { + Self::Build(builder) => builder.add_text_predicate(expr), + Self::Evaluate(_) => todo!(), + } + } + + pub fn compile(&mut self) -> Result<(), CompileError> { + match self { + Self::Build(builder) => { + *self = Self::Evaluate(builder.compile()?); + Ok(()) + } + Self::Evaluate(_) => todo!(), + } + } + + pub fn field_names(&self) -> Vec<&str> { + match self { + Self::Build(builder) => builder.field_names(), + Self::Evaluate(evaluator) => evaluator.field_names(), + } + } + + pub fn evaluate(&self, tile: &ResultTile) -> Result { + match self { + Self::Build(_) => todo!(), + Self::Evaluate(evaluator) => evaluator.evaluate(tile), + } + } + + pub fn evaluate_into_bitmap( + &self, + tile: &ResultTile, + bitmap: &mut [T], + ) -> Result<(), EvaluatePredicateError> + where + T: Copy + Zero, + { + match self { + Self::Build(_) => todo!(), + 
Self::Evaluate(evaluator) => evaluator.evaluate_into_bitmap(tile, bitmap), + } + } + + fn evaluate_into_bitmap_u8( + &self, + tile: &ResultTile, + bitmap: &mut [u8], + ) -> Result<(), EvaluatePredicateError> { + self.evaluate_into_bitmap::(tile, bitmap) + } + + fn evaluate_into_bitmap_u64( + &self, + tile: &ResultTile, + bitmap: &mut [u64], + ) -> Result<(), EvaluatePredicateError> { + self.evaluate_into_bitmap::(tile, bitmap) + } +} + +/// Structure which accumulates predicates. +pub struct Builder { + /// DataFusion evaluation context. dfsession: SessionContext, + /// Array schema mapped onto DataFusion data types. dfschema: DFSchema, + /// Logical syntax tree representations of the predicates. logical_exprs: Vec, - predicate: Option>, } -impl QueryPredicates { - pub fn new(schema: &ArraySchema) -> Result { - let (arrow_schema, _) = - tiledb_arrow::schema::to_arrow(schema, tiledb_arrow::schema::WhichSchema::View)?; +impl Builder { + pub fn new( + schema: &ArraySchema, + which: WhichSchema, + ) -> Result { + let (arrow_schema, _) = tiledb_arrow::schema::to_arrow(schema, which)?; let dfschema = { // SAFETY: this only errors if the names are not unique, // which they will be because `ArraySchema` requires it DFSchema::try_from(arrow_schema).unwrap() }; - Ok(QueryPredicates { + Ok(Builder { dfsession: SessionContext::from( SessionStateBuilder::new_with_default_features().build(), ), dfschema, logical_exprs: vec![], - predicate: None, }) } - /// Returns a list of all of the field names used in all of the predicates - pub fn field_names(&self) -> Vec { + /// Returns a list of all of the field names used in all of the predicates. 
+ pub fn field_names(&self) -> Vec<&str> { self.logical_exprs .iter() .flat_map(tiledb_expr::logical_expr::columns) @@ -106,7 +191,7 @@ impl QueryPredicates { .collect() } - pub fn add_predicate(&mut self, expr: &str) -> Result<(), AddPredicateError> { + pub fn add_text_predicate(&mut self, expr: &str) -> Result<(), AddPredicateError> { let parsed_expr = self .dfsession .parse_sql_expr(expr, &self.dfschema) @@ -121,6 +206,10 @@ impl QueryPredicates { .map(|t| t.data) .map_err(AddPredicateError::TypeCoercion)?; + self.add_predicate(logical_expr) + } + + pub fn add_predicate(&mut self, logical_expr: Expr) -> Result<(), AddPredicateError> { let output_type = logical_expr .get_type(&self.dfschema) .map_err(AddPredicateError::OutputType)?; @@ -129,23 +218,67 @@ impl QueryPredicates { } else if tiledb_expr::logical_expr::has_aggregate_functions(&logical_expr) { return Err(AddPredicateError::ContainsAggregateFunctions); } - let physical_expr = datafusion::physical_expr::create_physical_expr( - &logical_expr, - &self.dfschema, - &ExecutionProps::new(), - ) - .map_err(AddPredicateError::Compile)?; - self.logical_exprs.push(logical_expr); - self.predicate = Some(datafusion::physical_expr::conjunction( - self.predicate - .take() - .into_iter() - .chain(std::iter::once(physical_expr)), - )); + Ok(()) } + pub fn compile(&self) -> Result { + let evaluation_schema = { + let projection_fields = self + .field_names() + .iter() + .map(|fname| self.dfschema.as_arrow().field_with_name(fname)) + .process_results(|fs| fs.cloned().collect::>()); + + let projection_fields = { + // SAFETY: all field names have already been validated as part of the schema + projection_fields.unwrap() + }; + + // SAFETY: this only errors if the names are not unique, + // which they will be because `self.field_names()` produces unique field names + DFSchema::try_from(ArrowSchema::new(projection_fields)).unwrap() + }; + let predicate = { + let execution_props = ExecutionProps::new(); + self.logical_exprs + 
.iter() + .map(|e| { + datafusion::physical_expr::create_physical_expr( + e, + &evaluation_schema, + &execution_props, + ) + .map_err(CompileError::PhysicalExpr) + }) + .process_results(|es| datafusion::physical_expr::conjunction(es))? + }; + Ok(Evaluator { + dfschema: evaluation_schema, + predicate, + }) + } +} + +pub struct Evaluator { + /// Array schema mapped onto DataFusion data types; this is a projection of the full schema + /// consisting only of the fields which are used to evaluate `self.predicate`. + dfschema: DFSchema, + /// Expression evaluator which evaluates all predicates as a conjunction. + predicate: Arc, +} + +impl Evaluator { + /// Returns a list of all of the field names used in all of the predicates. + pub fn field_names(&self) -> Vec<&str> { + self.dfschema + .fields() + .iter() + .map(|f| f.name().as_ref()) + .collect::>() + } + pub fn evaluate(&self, tile: &ResultTile) -> Result { let rb = unsafe { // SAFETY: "This function is safe to call as long as the returned @@ -153,14 +286,12 @@ impl QueryPredicates { // The RecordBatch only lives in this stack frame, so we will follow this contract. tiledb_arrow::record_batch::to_record_batch(self.dfschema.inner(), tile)? }; - if let Some(p) = self.predicate.as_ref() { - Ok(p.evaluate(&rb).map_err(EvaluatePredicateError::Evaluate)?) 
- } else { - Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some(true)))) - } + self.predicate + .evaluate(&rb) + .map_err(EvaluatePredicateError::Evaluate) } - fn evaluate_into_bitmap( + pub fn evaluate_into_bitmap( &self, tile: &ResultTile, bitmap: &mut [T], @@ -208,22 +339,6 @@ impl QueryPredicates { } } } - - fn evaluate_into_bitmap_u8( - &self, - tile: &ResultTile, - bitmap: &mut [u8], - ) -> Result<(), EvaluatePredicateError> { - self.evaluate_into_bitmap::(tile, bitmap) - } - - fn evaluate_into_bitmap_u64( - &self, - tile: &ResultTile, - bitmap: &mut [u64], - ) -> Result<(), EvaluatePredicateError> { - self.evaluate_into_bitmap::(tile, bitmap) - } } fn new_query_predicates_ffi( diff --git a/tiledb/oxidize/staticlibs/unit-query-condition/Cargo.toml b/tiledb/oxidize/staticlibs/unit-query-condition/Cargo.toml index b56b9e155c6..27a434af806 100644 --- a/tiledb/oxidize/staticlibs/unit-query-condition/Cargo.toml +++ b/tiledb/oxidize/staticlibs/unit-query-condition/Cargo.toml @@ -14,6 +14,7 @@ tiledb-cxx-interface = { workspace = true } tiledb-expr = { workspace = true } tiledb-pod = { workspace = true } tiledb-common = { workspace = true } +tiledb-query-predicates = { workspace = true } tiledb-test-array-schema = { workspace = true } tiledb-test-cells = { workspace = true } tiledb-test-ffi = { workspace = true } diff --git a/tiledb/oxidize/staticlibs/unit-query-condition/src/lib.rs b/tiledb/oxidize/staticlibs/unit-query-condition/src/lib.rs index af1b74781be..4539a839b58 100644 --- a/tiledb/oxidize/staticlibs/unit-query-condition/src/lib.rs +++ b/tiledb/oxidize/staticlibs/unit-query-condition/src/lib.rs @@ -1,5 +1,6 @@ pub use tiledb_arrow; pub use tiledb_expr; +pub use tiledb_query_predicates; #[cxx::bridge] mod ffi { @@ -14,6 +15,15 @@ mod ffi { type ResultTile = tiledb_cxx_interface::sm::query::readers::ResultTile; } + #[namespace = "tiledb::test::query_condition_datafusion"] + extern "Rust" { + fn evaluate_as_datafusion( + array_schema: &ArraySchema, + 
query_condition: &ASTNode, + tile: &ResultTile, + ) -> Result>; + } + #[namespace = "tiledb::test::query_condition_datafusion"] unsafe extern "C++" { include!("tiledb/oxidize/staticlibs/unit-query-condition/cc/oxidize.h"); @@ -48,13 +58,39 @@ use arrow::datatypes::{DataType as ArrowDataType, Field as ArrowField, Schema as use arrow::record_batch::RecordBatch; use proptest::prelude::*; use proptest::test_runner::{TestCaseError, TestRunner}; +use tiledb_arrow::schema::WhichSchema; use tiledb_common::query::condition::QueryConditionExpr; use tiledb_common::query::condition::strategy::Parameters as QueryConditionParameters; +use tiledb_cxx_interface::sm::array_schema::ArraySchema; +use tiledb_cxx_interface::sm::query::ast::ASTNode; +use tiledb_cxx_interface::sm::query::readers::ResultTile; use tiledb_pod::array::schema::SchemaData; use tiledb_pod::array::schema::strategy::Requirements as SchemaRequirements; use tiledb_test_cells::strategy::{CellsParameters, CellsStrategySchema, SchemaWithDomain}; use tiledb_test_cells::{Cells, FieldData}; +fn evaluate_as_datafusion( + array_schema: &ArraySchema, + query_condition: &ASTNode, + tile: &ResultTile, +) -> anyhow::Result> { + let logical_expr = tiledb_test_query_condition::logical_expr::to_datafusion( + array_schema, + WhichSchema::Storage, + query_condition, + )?; + + let mut qbuilder = tiledb_query_predicates::Builder::new(array_schema, WhichSchema::Storage)?; + qbuilder.add_predicate(logical_expr)?; + + let qeval = qbuilder.compile()?; + + let mut bitmap = vec![1u8; tile.cell_num() as usize]; + qeval.evaluate_into_bitmap::(tile, &mut bitmap)?; + + Ok(bitmap) +} + fn instance_query_condition_datafusion( schema: &SchemaData, cells: &Cells, diff --git a/tiledb/oxidize/test-support/query-condition/Cargo.toml b/tiledb/oxidize/test-support/query-condition/Cargo.toml index 2c213c353b0..a20e5b0a040 100644 --- a/tiledb/oxidize/test-support/query-condition/Cargo.toml +++ b/tiledb/oxidize/test-support/query-condition/Cargo.toml @@ 
-7,6 +7,12 @@ version = { workspace = true } [dependencies] anyhow = { workspace = true } cxx = { workspace = true } +datafusion = { workspace = true } +itertools = { workspace = true } +num-traits = { workspace = true } +thiserror = { workspace = true } +tiledb-arrow = { workspace = true } tiledb-common = { workspace = true } tiledb-cxx-interface = { workspace = true } +tiledb-datatype = { workspace = true } tiledb-test-support-cxx-interface = { workspace = true } diff --git a/tiledb/oxidize/test-support/query-condition/src/lib.rs b/tiledb/oxidize/test-support/query-condition/src/lib.rs index a66df036967..8f90cb2445c 100644 --- a/tiledb/oxidize/test-support/query-condition/src/lib.rs +++ b/tiledb/oxidize/test-support/query-condition/src/lib.rs @@ -4,8 +4,8 @@ //! This enables property-based testing against arbitrary query conditions //! using the strategies we have already written in `tiledb_common`. -mod datafusion; mod enums; +pub mod logical_expr; use tiledb_common::query::condition::*; use tiledb_cxx_interface::sm::query::ast::ASTNode; diff --git a/tiledb/oxidize/test-support/query-condition/src/datafusion.rs b/tiledb/oxidize/test-support/query-condition/src/logical_expr.rs similarity index 97% rename from tiledb/oxidize/test-support/query-condition/src/datafusion.rs rename to tiledb/oxidize/test-support/query-condition/src/logical_expr.rs index 4377ac58825..82ed92b3d1e 100644 --- a/tiledb/oxidize/test-support/query-condition/src/datafusion.rs +++ b/tiledb/oxidize/test-support/query-condition/src/logical_expr.rs @@ -22,9 +22,7 @@ use tiledb_cxx_interface::sm::misc::ByteVecValue; use tiledb_cxx_interface::sm::query::ast::ASTNode; use tiledb_datatype::apply_physical_type; -use crate::logical_expr::LogicalExpr; - -/// An error constructing a [LogicalExpr] for a query condition. +/// An error constructing an [Expr] for a query condition. 
#[derive(Debug, thiserror::Error)] pub enum Error { #[error("Query condition expression internal error: {0}")] @@ -417,7 +415,7 @@ fn combination_ast_to_binary_expr( ) -> Result { let mut level = query_condition .children() - .map(|ast| to_datafusion_impl(schema, which, ast)) + .map(|ast| to_datafusion(schema, which, ast)) .collect::, _>>()?; while level.len() != 1 { @@ -447,7 +445,7 @@ fn combination_ast_to_binary_expr( Ok(level.into_iter().next().unwrap()) } -fn to_datafusion_impl( +pub fn to_datafusion( schema: &ArraySchema, which: WhichSchema, query_condition: &ASTNode, @@ -465,7 +463,7 @@ fn to_datafusion_impl( if children.len() != 1 { return Err(InternalError::NotTree(children.len()).into()); } - let negate_arg = to_datafusion_impl(schema, which, children[0])?; + let negate_arg = to_datafusion(schema, which, children[0])?; Ok(!negate_arg) } invalid => Err(InternalError::InvalidCombinationOp(invalid.repr.into()).into()), @@ -541,17 +539,3 @@ fn to_datafusion_impl( } } } - -/// Returns a [LogicalExpr] which represents the same expression -/// as the requested query condition. 
-pub fn to_datafusion( - schema: &ArraySchema, - which: &WhichSchema, - query_condition: &ASTNode, -) -> Result, Error> { - Ok(Box::new(LogicalExpr(to_datafusion_impl( - schema, - *which, - query_condition, - )?))) -} diff --git a/tiledb/oxidize/test-support/result-tile/src/lib.rs b/tiledb/oxidize/test-support/result-tile/src/lib.rs index e90529b0b7c..9db38668e7e 100644 --- a/tiledb/oxidize/test-support/result-tile/src/lib.rs +++ b/tiledb/oxidize/test-support/result-tile/src/lib.rs @@ -9,7 +9,7 @@ use std::collections::HashMap; use std::ops::Deref; use std::sync::Arc; -use arrow::array::{Array as ArrowArray, GenericListArray, PrimitiveArray}; +use arrow::array::{Array as ArrowArray, GenericListArray, NativeAdapter, PrimitiveArray}; use arrow::buffer::OffsetBuffer; use arrow::datatypes::{Field as ArrowField, Schema as ArrowSchema}; use arrow::record_batch::RecordBatch; @@ -17,6 +17,31 @@ use tiledb_cxx_interface::sm::array_schema::{ArraySchema, CellValNum}; use tiledb_cxx_interface::sm::query::readers::ResultTile; use tiledb_test_cells::{Cells, FieldData, typed_field_data_go}; +/// Associates a native type with an `ArrowDataType` value which has the +/// same corresponding native type +pub trait TypeTraits { + type ArrowPrimitiveType; +} + +macro_rules! type_traits { + ($ty:ty, $primitive_type:ident) => { + impl TypeTraits for $ty { + type ArrowPrimitiveType = arrow::datatypes::$primitive_type; + } + }; +} + +type_traits!(i8, Int8Type); +type_traits!(i16, Int16Type); +type_traits!(i32, Int32Type); +type_traits!(i64, Int64Type); +type_traits!(u8, UInt8Type); +type_traits!(u16, UInt16Type); +type_traits!(u32, UInt32Type); +type_traits!(u64, UInt64Type); +type_traits!(f32, Float32Type); +type_traits!(f64, Float64Type); + /// Packages a `ResultTile` with the buffers which contain the tile data. pub struct PackagedResultTile { /// Buffers underlying the [ResultTile]. 
@@ -214,14 +239,21 @@ fn cells_to_record_batch(cells: &Cells) -> RecordBatch { fn field_data_to_array(field: &FieldData) -> Arc { typed_field_data_go!( field, - _DT, + DT, cells, - Arc::new(cells.iter().copied().collect::>()) as Arc, + Arc::new( + cells + .iter() + .copied() + .map(NativeAdapter::<
::ArrowPrimitiveType>::from) + .collect::>() + ) as Arc, { let values = cells .iter() .flatten() .copied() + .map(NativeAdapter::<
::ArrowPrimitiveType>::from) .collect::>(); let offsets = OffsetBuffer::::from_lengths(cells.iter().map(|s| s.len())); let cells = GenericListArray::new( diff --git a/tiledb/sm/query/query.cc b/tiledb/sm/query/query.cc index eeb11b78f20..8cb446cd929 100644 --- a/tiledb/sm/query/query.cc +++ b/tiledb/sm/query/query.cc @@ -729,6 +729,12 @@ void Query::init() { fragment_name_)); } +#ifdef HAVE_RUST + if (predicates_.datafusion_.has_value()) { + predicates_.datafusion_.value()->compile(); + } +#endif + // Create the query strategy if querying main array and the Subarray does // not need to be updated. if (!only_dim_label_query() && !subarray_.has_label_ranges()) { diff --git a/tiledb/sm/query/test/unit_query_condition.cc b/tiledb/sm/query/test/unit_query_condition.cc index 26b57a00892..ff48913acf0 100644 --- a/tiledb/sm/query/test/unit_query_condition.cc +++ b/tiledb/sm/query/test/unit_query_condition.cc @@ -5188,33 +5188,22 @@ std::vector instance( const tiledb::sm::ASTNode& ast) { using Asserter = tiledb::test::AsserterRapidcheck; - // set up traditional TileDB evaluation + // evaluate using traditional TileDB evaluation QueryCondition qc_ast(ast.clone()); qc_ast.rewrite_for_schema(array_schema); - - // set up datafusion evaluation - QueryCondition qc_datafusion(ast.clone()); - qc_datafusion.rewrite_for_schema(array_schema); - const bool datafusion_ok = qc_datafusion.rewrite_to_datafusion( - array_schema, tiledb::oxidize::arrow::schema::WhichSchema::Storage); - ASSERTER(datafusion_ok); - - // prepare to evaluate QueryCondition::Params params( tiledb::test::get_test_memory_tracker(), array_schema); std::vector bitmap_ast(tile.cell_num(), 1); - std::vector bitmap_datafusion(tile.cell_num(), 1); - - // evaluate traditional ast const auto status_ast = qc_ast.apply_sparse(params, tile, bitmap_ast); ASSERTER(status_ast.ok()); - // evaluate datafusion - const auto status_datafusion = - qc_datafusion.apply_sparse(params, tile, bitmap_datafusion); - 
ASSERTER(status_datafusion.ok()); + // evaluate using datafusion + const auto rs_bitmap_datafusion = + evaluate_as_datafusion(array_schema, *qc_ast.ast().get(), tile); + std::vector bitmap_datafusion( + rs_bitmap_datafusion.begin(), rs_bitmap_datafusion.end()); // compare ASSERTER(bitmap_ast == bitmap_datafusion); From 6f7d0801f1e57c54bf542137b004284582fb8b63 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Mon, 17 Nov 2025 17:14:01 -0500 Subject: [PATCH 51/52] self review --- test/src/unit-query-add-predicate.cc | 58 ++++++++++++++++--- tiledb/oxidize/arrow/src/schema.rs | 3 +- tiledb/oxidize/query-predicates/src/lib.rs | 46 ++++++++++++--- .../test-support/result-tile/src/lib.rs | 3 + tiledb/sm/query/query_condition.cc | 28 --------- tiledb/sm/query/query_condition.h | 2 +- 6 files changed, 93 insertions(+), 47 deletions(-) diff --git a/test/src/unit-query-add-predicate.cc b/test/src/unit-query-add-predicate.cc index d45e48a82b2..3754c91381e 100644 --- a/test/src/unit-query-add-predicate.cc +++ b/test/src/unit-query-add-predicate.cc @@ -214,6 +214,13 @@ const Cells expect_e_is_null = make_cells( {"four", "seven", "ten", "thirteen", "sixteen"}, {std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt}); +const Cells expect_a_is_null_and_row_gt_col = make_cells( + {2, 4}, + {1, 1}, + {std::nullopt, std::nullopt}, + {"five", "thirteen"}, + {7, std::nullopt}); + const Cells expect_a_is_null_and_v_starts_with_t = make_cells( {1, 1, 4}, {2, 3, 1}, @@ -593,6 +600,14 @@ TEST_CASE_METHOD( } } +/** + * Tests applying datafusion predicates to sparse global order reader. + * + * NB: `WHERE TRUE` and `WHERE FALSE` and `WHERE NULL` may look silly + * but they exercise the `ColumnarValue::Scalar` branches of evaluation + * which become increasingly important once constant folding is + * introduced. 
+ */ TEST_CASE_METHOD( QueryAddPredicateFx, "Query add predicate sparse global order", @@ -614,6 +629,16 @@ TEST_CASE_METHOD( CHECK(result == INPUT); } + SECTION("WHERE FALSE") { + const auto result = query_array(array_name, query_order, {"FALSE"}); + CHECK(result == Cells()); + } + + SECTION("WHERE NULL") { + const auto result = query_array(array_name, query_order, {"NULL"}); + CHECK(result == Cells()); + } + SECTION("WHERE a IS NOT NULL") { const Cells expect = make_cells( {1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4}, @@ -673,16 +698,33 @@ TEST_CASE_METHOD( } SECTION("WHERE a IS NULL AND row > col") { - const Cells expect = make_cells( - {2, 4}, - {1, 1}, - {std::nullopt, std::nullopt}, - {"five", "thirteen"}, - {7, std::nullopt}); - const auto result = query_array(array_name, query_order, {"a IS NULL", "row > col"}); - CHECK(result == expect); + CHECK(result == expect_a_is_null_and_row_gt_col); + } + + SECTION("WHERE a IS NULL AND TRUE AND row > col") { + const auto result = query_array( + array_name, query_order, {"a IS NULL", "TRUE", "row > col"}); + CHECK(result == expect_a_is_null_and_row_gt_col); + } + + SECTION("WHERE a IS NULL AND row > col AND TRUE") { + const auto result = query_array( + array_name, query_order, {"a IS NULL", "row > col", "TRUE"}); + CHECK(result == expect_a_is_null_and_row_gt_col); + } + + SECTION("WHERE a IS NULL AND FALSE AND row > col") { + const auto result = query_array( + array_name, query_order, {"a IS NULL", "FALSE", "row > col"}); + CHECK(result == Cells()); + } + + SECTION("WHERE a IS NULL AND NULL AND row > col") { + const auto result = query_array( + array_name, query_order, {"a IS NULL", "FALSE", "row > col"}); + CHECK(result == Cells()); } SECTION("WHERE coalesce(a, row) > col") { diff --git a/tiledb/oxidize/arrow/src/schema.rs b/tiledb/oxidize/arrow/src/schema.rs index 093470ea028..ca2addcce48 100644 --- a/tiledb/oxidize/arrow/src/schema.rs +++ b/tiledb/oxidize/arrow/src/schema.rs @@ -159,7 +159,8 @@ pub fn field_arrow_datatype( 
)); } - // NB: This branch is reached from `session::parse_expr` which requires + // NB: This branch is reached from + // `tiledb_query_predicates::Builder::add_predicate` which requires // a schema in order to parse the text into logical expression. // However, we may not have the enumeration loaded, and without // loading it we don't know the type (since the type is co-located diff --git a/tiledb/oxidize/query-predicates/src/lib.rs b/tiledb/oxidize/query-predicates/src/lib.rs index 41649369fa8..c642626ab92 100644 --- a/tiledb/oxidize/query-predicates/src/lib.rs +++ b/tiledb/oxidize/query-predicates/src/lib.rs @@ -52,6 +52,8 @@ pub enum ParseExprError { #[derive(Debug, thiserror::Error)] pub enum AddPredicateError { + #[error("Query is in progress")] + InvalidState, #[error("Parse error: {0}")] Parse(#[source] datafusion::common::DataFusionError), #[error("Expression is not a predicate: found return type {0}")] @@ -66,12 +68,16 @@ pub enum AddPredicateError { #[derive(Debug, thiserror::Error)] pub enum CompileError { + #[error("Query is in progress")] + InvalidState, #[error("Expression compile error: {0}")] PhysicalExpr(#[source] datafusion::common::DataFusionError), } #[derive(Debug, thiserror::Error)] pub enum EvaluatePredicateError { + #[error("Query has not been started")] + InvalidState, #[error("Data error: {0}")] ResultTile(#[from] tiledb_arrow::record_batch::Error), #[error("Evaluation error: {0}")] @@ -80,7 +86,9 @@ pub enum EvaluatePredicateError { /// Holds state to parse, analyze and evaluate predicates of a TileDB query. pub enum QueryPredicates { + /// Predicates are being added to the query. Build(Builder), + /// The query is being evaluated. Evaluate(Evaluator), } @@ -89,23 +97,29 @@ impl QueryPredicates { Ok(Self::Build(Builder::new(schema, WhichSchema::View)?)) } + /// Parses a text predicate into a logical expression and adds it to the list of predicates to + /// evaluate. + /// + /// This is only valid from the `Build` state. 
pub fn add_text_predicate(&mut self, expr: &str) -> Result<(), AddPredicateError> { match self { Self::Build(builder) => builder.add_text_predicate(expr), - Self::Evaluate(_) => todo!(), + Self::Evaluate(_) => Err(AddPredicateError::InvalidState), } } + /// Transitions state from `Build` to `Evaluate`. pub fn compile(&mut self) -> Result<(), CompileError> { match self { Self::Build(builder) => { *self = Self::Evaluate(builder.compile()?); Ok(()) } - Self::Evaluate(_) => todo!(), + Self::Evaluate(_) => Err(CompileError::InvalidState), } } + /// Returns a list of unique field names which are used in the predicates. pub fn field_names(&self) -> Vec<&str> { match self { Self::Build(builder) => builder.field_names(), @@ -115,7 +129,7 @@ impl QueryPredicates { pub fn evaluate(&self, tile: &ResultTile) -> Result { match self { - Self::Build(_) => todo!(), + Self::Build(_) => Err(EvaluatePredicateError::InvalidState), Self::Evaluate(evaluator) => evaluator.evaluate(tile), } } @@ -129,7 +143,7 @@ impl QueryPredicates { T: Copy + Zero, { match self { - Self::Build(_) => todo!(), + Self::Build(_) => Err(EvaluatePredicateError::InvalidState), Self::Evaluate(evaluator) => evaluator.evaluate_into_bitmap(tile, bitmap), } } @@ -191,6 +205,8 @@ impl Builder { .collect() } + /// Parses a predicate into a logical expression and adds it to the list of predicates to + /// evaluate. pub fn add_text_predicate(&mut self, expr: &str) -> Result<(), AddPredicateError> { let parsed_expr = self .dfsession @@ -209,11 +225,13 @@ impl Builder { self.add_predicate(logical_expr) } + /// Adds a predicate to the list of predicates to evaluate. 
pub fn add_predicate(&mut self, logical_expr: Expr) -> Result<(), AddPredicateError> { let output_type = logical_expr .get_type(&self.dfschema) .map_err(AddPredicateError::OutputType)?; - if output_type != DataType::Boolean { + if output_type != DataType::Boolean && output_type != DataType::Null { + // NB: see non-pub DataFusion API `Filter::is_allowed_filter_type` return Err(AddPredicateError::NotAPredicate(output_type)); } else if tiledb_expr::logical_expr::has_aggregate_functions(&logical_expr) { return Err(AddPredicateError::ContainsAggregateFunctions); @@ -223,6 +241,7 @@ impl Builder { Ok(()) } + /// Returns an `Evaluator` which can evaluate the conjunction of all of the predicates. pub fn compile(&self) -> Result { let evaluation_schema = { let projection_fields = self @@ -264,6 +283,9 @@ impl Builder { pub struct Evaluator { /// Array schema mapped onto DataFusion data types; this is a projection of the full schema /// consisting only of the fields which are used to evaluate `self.predicate`. + /// The tiles corresponding to fields in this schema will be converted to [RecordBatch] + /// columns, so to avoid extra conversions (which may allocate memory) we do not + /// want to keep all of the fields here. dfschema: DFSchema, /// Expression evaluator which evaluates all predicates as a conjunction. predicate: Arc, @@ -299,7 +321,9 @@ impl Evaluator { where T: Copy + Zero, { - // TODO: consider not evaluating on cells where the bitmap is already set + // TODO: consider not evaluating on cells where the bitmap is already set. + // This might happen if there is a historical query condition or if there + // is timestamp duplication. 
let result = self.evaluate(tile)?; match result { @@ -313,13 +337,13 @@ impl Evaluator { bitmap.fill(T::zero()); Ok(()) } - ScalarValue::Boolean(None) => { + ScalarValue::Null | ScalarValue::Boolean(None) => { // no cells pass predicates, clear bitmap bitmap.fill(T::zero()); Ok(()) } _ => { - // should not be reachable due to return type check + // should not be reachable due to return type check in `Builder::add_predicate` unreachable!() } }, @@ -332,8 +356,12 @@ impl Evaluator { } } Ok(()) + } else if *a.data_type() == DataType::Null { + // no cells pass predicates, clear bitmap + bitmap.fill(T::zero()); + Ok(()) } else { - // should not be reachable due to return type check + // should not be reachable due to return type check in `Builder::add_predicate` unreachable!() } } diff --git a/tiledb/oxidize/test-support/result-tile/src/lib.rs b/tiledb/oxidize/test-support/result-tile/src/lib.rs index 9db38668e7e..ed81a7b8991 100644 --- a/tiledb/oxidize/test-support/result-tile/src/lib.rs +++ b/tiledb/oxidize/test-support/result-tile/src/lib.rs @@ -26,6 +26,9 @@ pub trait TypeTraits { macro_rules! type_traits { ($ty:ty, $primitive_type:ident) => { impl TypeTraits for $ty { + /// Associated `arrow` data type for constructing a `PrimitiveArray`. + // NB: we don't really care about logical type here, if we did we will need a different + // solution. 
type ArrowPrimitiveType = arrow::datatypes::$primitive_type; } }; diff --git a/tiledb/sm/query/query_condition.cc b/tiledb/sm/query/query_condition.cc index e0a8e6b97bb..169f7bd15aa 100644 --- a/tiledb/sm/query/query_condition.cc +++ b/tiledb/sm/query/query_condition.cc @@ -166,34 +166,6 @@ void QueryCondition::rewrite_for_schema(const ArraySchema& array_schema) { tree_->rewrite_for_schema(array_schema); } -/* -#ifdef HAVE_RUST -rust::Box -QueryCondition::as_datafusion( - const ArraySchema& array_schema, - tiledb::oxidize::arrow::schema::WhichSchema which) { - return tiledb::oxidize::datafusion::logical_expr::create( - array_schema, which, *tree_.get()); -} - -bool QueryCondition::rewrite_to_datafusion( - const ArraySchema& array_schema, - tiledb::oxidize::arrow::schema::WhichSchema which) { - if (!datafusion_.has_value()) { - try { - datafusion_.emplace( - array_schema, which, as_datafusion(array_schema, which)); - } catch (const ::rust::Error& e) { - throw QueryConditionException( - "Error compiling expression: " + std::string(e.what())); - } - return true; - } - return false; -} -#endif -*/ - Status QueryCondition::check(const ArraySchema& array_schema) const { if (!tree_) { return Status::Ok(); diff --git a/tiledb/sm/query/query_condition.h b/tiledb/sm/query/query_condition.h index da8a5fec21b..189dcc3f619 100644 --- a/tiledb/sm/query/query_condition.h +++ b/tiledb/sm/query/query_condition.h @@ -710,7 +710,7 @@ class QueryCondition { }; /** - * + * Bundles the different kinds of predicates which can be attached to a query. 
*/ struct QueryPredicates { std::optional condition_; From 199ea2fc6a650be18cbfda3f6d526ffc26fc19b9 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Mon, 17 Nov 2025 17:17:38 -0500 Subject: [PATCH 52/52] clippy --- tiledb/oxidize/arrow/src/record_batch.rs | 4 ++-- tiledb/oxidize/query-predicates/src/lib.rs | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tiledb/oxidize/arrow/src/record_batch.rs b/tiledb/oxidize/arrow/src/record_batch.rs index 1107b1d7f6b..584afe4518e 100644 --- a/tiledb/oxidize/arrow/src/record_batch.rs +++ b/tiledb/oxidize/arrow/src/record_batch.rs @@ -110,12 +110,12 @@ pub unsafe fn to_record_batch( // SAFETY: the four asserts above rule out each of the possible error conditions let arrow = if columns.is_empty() { RecordBatch::try_new_with_options( - Arc::clone(&schema), + Arc::clone(schema), columns, &RecordBatchOptions::new().with_row_count(Some(tile.cell_num() as usize)), ) } else { - RecordBatch::try_new(Arc::clone(&schema), columns) + RecordBatch::try_new(Arc::clone(schema), columns) }; let arrow = arrow.expect("Logic error: preconditions for constructing RecordBatch not met"); diff --git a/tiledb/oxidize/query-predicates/src/lib.rs b/tiledb/oxidize/query-predicates/src/lib.rs index c642626ab92..7623937c9ef 100644 --- a/tiledb/oxidize/query-predicates/src/lib.rs +++ b/tiledb/oxidize/query-predicates/src/lib.rs @@ -18,6 +18,7 @@ mod ffi { fn compile(&mut self) -> Result<()>; + #[allow(clippy::needless_lifetimes)] // NB: cxx does not seem to handle this lint unsafe fn field_names<'a>(&'a self) -> Vec<&'a str>; #[cxx_name = "add_predicate"]