From cacc1779a95de7d401bb5eb3767cf88d59e87ca2 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Thu, 12 Sep 2024 10:58:45 -0400 Subject: [PATCH 01/12] Add aggregates_string example showing how to run min/max on string field --- examples/c_api/aggregates_string.c | 288 +++++++++++++++++++++++++++++ examples/c_api/tiledb_examples.h | 42 +++++ 2 files changed, 330 insertions(+) create mode 100644 examples/c_api/aggregates_string.c create mode 100644 examples/c_api/tiledb_examples.h diff --git a/examples/c_api/aggregates_string.c b/examples/c_api/aggregates_string.c new file mode 100644 index 00000000000..3dc2eb69d56 --- /dev/null +++ b/examples/c_api/aggregates_string.c @@ -0,0 +1,288 @@ +/** + * @file aggregates_string.c + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2018-2024 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + * @section DESCRIPTION + * + * When run, this program will create a 2D sparse array with one dimension a + * string type, and the other an integer. This models closely what a dataframe + * looks like. The program will write some data to it, and compute the min + * and max values of the string dimension using aggregates. + */ + +#include +#include +#include +#include + +#include "tiledb_examples.h" + +// Name of array. +const char* array_name = "aggregates_string_array"; + +void create_array() { + // Create TileDB context + tiledb_ctx_t* ctx; + tiledb_ctx_alloc(NULL, &ctx); + + // The array will be 2d array with dimensions "rows" and "cols" + // "rows" is a string dimension type, so the domain and extent is null + int dim_domain[] = {1, 4}; + int tile_extents[] = {4}; + tiledb_dimension_t* d1; + tiledb_dimension_alloc(ctx, "rows", TILEDB_STRING_ASCII, NULL, NULL, &d1); + tiledb_dimension_t* d2; + tiledb_dimension_alloc( + ctx, "cols", TILEDB_INT32, &dim_domain[0], &tile_extents[0], &d2); + + // Create domain + tiledb_domain_t* domain; + tiledb_domain_alloc(ctx, &domain); + tiledb_domain_add_dimension(ctx, domain, d1); + tiledb_domain_add_dimension(ctx, domain, d2); + + // Create a single attribute "a" so each (i,j) cell can store an integer + tiledb_attribute_t* a; + tiledb_attribute_alloc(ctx, "a", TILEDB_INT32, &a); + + // Create array schema + tiledb_array_schema_t* array_schema; + tiledb_array_schema_alloc(ctx, TILEDB_SPARSE, &array_schema); + tiledb_array_schema_set_cell_order(ctx, array_schema, TILEDB_ROW_MAJOR); + tiledb_array_schema_set_tile_order(ctx, array_schema, TILEDB_ROW_MAJOR); + tiledb_array_schema_set_domain(ctx, array_schema, domain); + tiledb_array_schema_add_attribute(ctx, array_schema, a); + + // Create array + tiledb_array_create(ctx, array_name, array_schema); + + // Clean up + tiledb_attribute_free(&a); + tiledb_dimension_free(&d1); + tiledb_dimension_free(&d2); + tiledb_domain_free(&domain); + tiledb_array_schema_free(&array_schema); + 
tiledb_ctx_free(&ctx); +} + +void write_array() { + // Create TileDB context + tiledb_ctx_t* ctx; + tiledb_ctx_alloc(NULL, &ctx); + + // Open array for writing + tiledb_array_t* array; + tiledb_array_alloc(ctx, array_name, &array); + tiledb_array_open(ctx, array, TILEDB_WRITE); + + // Prepare data for first write + char coords_rows_1[] = {"barbazcorgefoo"}; + uint64_t coords_rows_size_1 = sizeof(coords_rows_1); + uint64_t coords_rows_offsets_1[] = {0, 3, 6, 11}; + uint64_t coords_rows_offsets_size_1 = sizeof(coords_rows_offsets_1); + int coords_cols_1[] = {1, 2, 3, 4}; + uint64_t coords_cols_size_1 = sizeof(coords_cols_1); + int data_1[] = {10, 20, 30, 40}; + uint64_t data_size_1 = sizeof(data_1); + + // Create first query + tiledb_query_t* query; + tiledb_query_alloc(ctx, array, TILEDB_WRITE, &query); + + // Global order enables writes in stages to a single fragment + // but requires input to match global order + tiledb_query_set_layout(ctx, query, TILEDB_GLOBAL_ORDER); + + // Prepare data for first write + TRY(ctx, tiledb_query_set_data_buffer(ctx, query, "a", data_1, &data_size_1)); + TRY(ctx, + tiledb_query_set_data_buffer( + ctx, query, "rows", coords_rows_1, &coords_rows_size_1)); + TRY(ctx, + tiledb_query_set_offsets_buffer( + ctx, + query, + "rows", + coords_rows_offsets_1, + &coords_rows_offsets_size_1)); + TRY(ctx, + tiledb_query_set_data_buffer( + ctx, query, "cols", coords_cols_1, &coords_cols_size_1)); + + // Submit first query + TRY(ctx, tiledb_query_submit(ctx, query)); + + // Prepare data for second write + char coords_rows_2[] = {"garplygraultgubquux"}; + uint64_t coords_rows_size_2 = sizeof(coords_rows_2); + uint64_t coords_rows_offsets_2[] = {0, 6, 12, 15}; + uint64_t coords_rows_offsets_size_2 = sizeof(coords_rows_offsets_2); + int coords_cols_2[] = {1, 2, 3, 4}; + uint64_t coords_cols_size_2 = sizeof(coords_cols_2); + int data_2[] = {50, 60, 70, 80}; + uint64_t data_size_2 = sizeof(data_2); + + // Reset buffers + TRY(ctx, 
tiledb_query_set_data_buffer(ctx, query, "a", data_2, &data_size_2)); + TRY(ctx, + tiledb_query_set_data_buffer( + ctx, query, "rows", coords_rows_2, &coords_rows_size_2)); + TRY(ctx, + tiledb_query_set_offsets_buffer( + ctx, + query, + "rows", + coords_rows_offsets_2, + &coords_rows_offsets_size_2)); + TRY(ctx, + tiledb_query_set_data_buffer( + ctx, query, "cols", coords_cols_2, &coords_cols_size_2)); + + // Submit second query + TRY(ctx, tiledb_query_submit(ctx, query)); + + // Finalize query (IMPORTANT) + TRY(ctx, tiledb_query_finalize(ctx, query)); + + // Close array + tiledb_array_close(ctx, array); + + // Clean up + tiledb_array_free(&array); + tiledb_query_free(&query); + tiledb_ctx_free(&ctx); +} + +void read_array() { + // Create TileDB context + tiledb_ctx_t* ctx; + tiledb_ctx_alloc(NULL, &ctx); + + // Open array for reading + tiledb_array_t* array; + tiledb_array_alloc(ctx, array_name, &array); + tiledb_array_open(ctx, array, TILEDB_READ); + + // Read entire array - no subarray + + // Calculate maximum buffer sizes + uint64_t max_size = 64; // variable-length result has unknown size + uint64_t max_offsets_size = sizeof(uint64_t); + uint64_t min_size = 64; // variable-length result has unknown size + uint64_t min_offsets_size = sizeof(uint64_t); + + // Result buffers (1 cell each of unknown size) + char* max = (char*)malloc(max_size); + uint64_t max_offsets[1]; + char* min = (char*)malloc(min_size); + uint64_t min_offsets[1]; + + // Create query + tiledb_query_t* query; + tiledb_query_alloc(ctx, array, TILEDB_READ, &query); + + // Get the default channel from the query + tiledb_query_channel_t* default_channel; + tiledb_query_get_default_channel(ctx, query, &default_channel); + + // Apply min aggregate + const tiledb_channel_operator_t* operator_min; + tiledb_channel_operation_t* min_rows; + TRY(ctx, tiledb_channel_operator_min_get(ctx, &operator_min)); + TRY(ctx, + tiledb_create_unary_aggregate( + ctx, query, operator_min, "rows", &min_rows)); + TRY(ctx, 
+ tiledb_channel_apply_aggregate( + ctx, default_channel, "Min(rows)", min_rows)); + + // Apply max aggregate + const tiledb_channel_operator_t* operator_max; + tiledb_channel_operation_t* max_rows; + TRY(ctx, tiledb_channel_operator_max_get(ctx, &operator_max)); + TRY(ctx, + tiledb_create_unary_aggregate( + ctx, query, operator_max, "rows", &max_rows)); + TRY(ctx, + tiledb_channel_apply_aggregate( + ctx, default_channel, "Max(rows)", max_rows)); + + TRY(ctx, tiledb_query_set_layout(ctx, query, TILEDB_UNORDERED)); + TRY(ctx, + tiledb_query_set_data_buffer(ctx, query, "Min(rows)", min, &min_size)); + TRY(ctx, + tiledb_query_set_offsets_buffer( + ctx, query, "Min(rows)", min_offsets, &min_offsets_size)); + TRY(ctx, + tiledb_query_set_data_buffer(ctx, query, "Max(rows)", max, &max_size)); + TRY(ctx, + tiledb_query_set_offsets_buffer( + ctx, query, "Max(rows)", max_offsets, &max_offsets_size)); + + // Submit query + tiledb_query_submit(ctx, query); + + // Close array + tiledb_array_close(ctx, array); + + // Print out the results. 
+ printf( + "Min has data %.*s\n", + (int)(min_size - min_offsets[0]), + &min[min_offsets[0]]); + printf( + "Max has data %.*s\n", + (int)(max_size - max_offsets[0]), + &max[max_offsets[0]]); + + // Clean up + free((void*)min); + free((void*)max); + tiledb_aggregate_free(ctx, &min_rows); + tiledb_aggregate_free(ctx, &max_rows); + tiledb_query_channel_free(ctx, &default_channel); + tiledb_array_free(&array); + tiledb_query_free(&query); + tiledb_ctx_free(&ctx); +} + +int main() { + // Get object type + tiledb_ctx_t* ctx; + tiledb_ctx_alloc(NULL, &ctx); + tiledb_object_t type; + tiledb_object_type(ctx, array_name, &type); + tiledb_ctx_free(&ctx); + + if (type != TILEDB_ARRAY) { + create_array(); + write_array(); + } + + read_array(); + return 0; +} + diff --git a/examples/c_api/tiledb_examples.h b/examples/c_api/tiledb_examples.h new file mode 100644 index 00000000000..6bdfcc8cfe8 --- /dev/null +++ b/examples/c_api/tiledb_examples.h @@ -0,0 +1,42 @@ +#ifndef TILEDB_EXAMPLES_H +#define TILEDB_EXAMPLES_H + +#include +#include + +static int try_print_error(int line, tiledb_ctx_t* ctx) { + // Retrieve the last error that occurred + tiledb_error_t* err = NULL; + tiledb_ctx_get_last_error(ctx, &err); + + if (err == NULL) { + return TILEDB_OK; + } + + const char* msg; + tiledb_error_message(err, &msg); + fprintf(stderr, "%d: %s\n", line, msg); + + // Clean up + tiledb_error_free(&err); + + return TILEDB_ERR; +} + +#define IF_ERROR_EXIT(ctx) \ + do { \ + if (try_print_error(__LINE__, (ctx)) != TILEDB_OK) { \ + exit(TILEDB_ERR); \ + } \ + } while (0) + +#define TRY(ctx, capi_call) \ + do { \ + const capi_return_t __ret = (capi_call); \ + if (__ret != TILEDB_OK) { \ + IF_ERROR_EXIT(ctx); \ + } \ + } while (0) + +#endif + From cf3f77649d57a262539e6ddfbca3683e3493b0e6 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Thu, 12 Sep 2024 11:04:14 -0400 Subject: [PATCH 02/12] Add query condition to aggregates_string.c --- examples/c_api/aggregates_string.c | 14 ++++++++++---- 1 file 
changed, 10 insertions(+), 4 deletions(-) diff --git a/examples/c_api/aggregates_string.c b/examples/c_api/aggregates_string.c index 3dc2eb69d56..bb82764b3a7 100644 --- a/examples/c_api/aggregates_string.c +++ b/examples/c_api/aggregates_string.c @@ -105,7 +105,7 @@ void write_array() { uint64_t coords_rows_offsets_size_1 = sizeof(coords_rows_offsets_1); int coords_cols_1[] = {1, 2, 3, 4}; uint64_t coords_cols_size_1 = sizeof(coords_cols_1); - int data_1[] = {10, 20, 30, 40}; + int data_1[] = {3, 3, 5, 3}; uint64_t data_size_1 = sizeof(data_1); // Create first query @@ -142,7 +142,7 @@ void write_array() { uint64_t coords_rows_offsets_size_2 = sizeof(coords_rows_offsets_2); int coords_cols_2[] = {1, 2, 3, 4}; uint64_t coords_cols_size_2 = sizeof(coords_cols_2); - int data_2[] = {50, 60, 70, 80}; + int data_2[] = {6, 6, 3, 4}; uint64_t data_size_2 = sizeof(data_2); // Reset buffers @@ -186,8 +186,6 @@ void read_array() { tiledb_array_alloc(ctx, array_name, &array); tiledb_array_open(ctx, array, TILEDB_READ); - // Read entire array - no subarray - // Calculate maximum buffer sizes uint64_t max_size = 64; // variable-length result has unknown size uint64_t max_offsets_size = sizeof(uint64_t); @@ -204,6 +202,14 @@ void read_array() { tiledb_query_t* query; tiledb_query_alloc(ctx, array, TILEDB_READ, &query); + // Query cells with a >= 4 + tiledb_query_condition_t* qc; + tiledb_query_condition_alloc(ctx, &qc); + const int32_t a_lower_bound = 4; + tiledb_query_condition_init( + ctx, qc, "a", &a_lower_bound, sizeof(int32_t), TILEDB_GE); + tiledb_query_set_condition(ctx, query, qc); + // Get the default channel from the query tiledb_query_channel_t* default_channel; tiledb_query_get_default_channel(ctx, query, &default_channel); From ca6a86240d3c0edfc2a6e97b2d9575ea00debebe Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Thu, 12 Sep 2024 11:14:25 -0400 Subject: [PATCH 03/12] aggregates_string_c also emits values of dimensions/attribute --- 
examples/c_api/aggregates_string.c | 44 +++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/examples/c_api/aggregates_string.c b/examples/c_api/aggregates_string.c index bb82764b3a7..489f12d833f 100644 --- a/examples/c_api/aggregates_string.c +++ b/examples/c_api/aggregates_string.c @@ -192,12 +192,23 @@ void read_array() { uint64_t min_size = 64; // variable-length result has unknown size uint64_t min_offsets_size = sizeof(uint64_t); - // Result buffers (1 cell each of unknown size) + // Aggregate result buffers (1 cell each of unknown size) char* max = (char*)malloc(max_size); uint64_t max_offsets[1]; char* min = (char*)malloc(min_size); uint64_t min_offsets[1]; + // Attribute/dimension buffers + // (unknown number of cells, buffer sizes are estimates) + char rows_data[64]; + uint64_t rows_data_size = sizeof(rows_data); + uint64_t rows_offsets[8]; + uint64_t rows_offsets_size = sizeof(rows_offsets); + int32_t cols_data[8]; + uint64_t cols_size = sizeof(cols_data); + int32_t a_data[8]; + uint64_t a_size = sizeof(a_data); + // Create query tiledb_query_t* query; tiledb_query_alloc(ctx, array, TILEDB_READ, &query); @@ -210,6 +221,18 @@ void read_array() { ctx, qc, "a", &a_lower_bound, sizeof(int32_t), TILEDB_GE); tiledb_query_set_condition(ctx, query, qc); + // Add attribute/dimension result buffers + TRY(ctx, + tiledb_query_set_data_buffer( + ctx, query, "rows", &rows_data[0], &rows_data_size)); + TRY(ctx, + tiledb_query_set_offsets_buffer( + ctx, query, "rows", &rows_offsets[0], &rows_offsets_size)); + TRY(ctx, + tiledb_query_set_data_buffer( + ctx, query, "cols", &cols_data[0], &cols_size)); + TRY(ctx, tiledb_query_set_data_buffer(ctx, query, "a", &a_data[0], &a_size)); + // Get the default channel from the query tiledb_query_channel_t* default_channel; tiledb_query_get_default_channel(ctx, query, &default_channel); @@ -264,6 +287,25 @@ void read_array() { (int)(max_size - max_offsets[0]), &max[max_offsets[0]]); + uint64_t 
result_num = (uint64_t)(a_size / sizeof(int32_t)); + for (uint64_t r = 0; r < result_num; r++) { + // For strings we must compute the length based on the offsets + uint64_t row_start = rows_offsets[r]; + uint64_t row_end = + r == result_num - 1 ? result_num : rows_offsets[r + 1] - 1; + const int row_value_size = row_end - row_start + 1; + const char* row_value = &rows_data[row_start]; + + const int32_t col_value = cols_data[r]; + const int32_t a_value = a_data[r]; + printf( + "Cell (%.*s, %i) has data %d\n", + row_value_size, + row_value, + col_value, + a_value); + } + // Clean up free((void*)min); free((void*)max); From 1eb8123daa868cff2b88294abdd495622e028e54 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Thu, 12 Sep 2024 11:25:50 -0400 Subject: [PATCH 04/12] Add INCOMPLETE loop to aggregates_string.c --- examples/c_api/aggregates_string.c | 80 ++++++++++++++++++++---------- 1 file changed, 55 insertions(+), 25 deletions(-) diff --git a/examples/c_api/aggregates_string.c b/examples/c_api/aggregates_string.c index 489f12d833f..38b995b685d 100644 --- a/examples/c_api/aggregates_string.c +++ b/examples/c_api/aggregates_string.c @@ -176,6 +176,32 @@ void write_array() { tiledb_ctx_free(&ctx); } +void print_cells( + uint64_t result_num, + uint64_t* rows_offsets, + uint64_t rows_data_size, + char* rows_data, + int32_t* cols_data, + int32_t* a_data) { + for (uint64_t r = 0; r < result_num; r++) { + // For strings we must compute the length based on the offsets + uint64_t row_start = rows_offsets[r]; + uint64_t row_end = + r == result_num - 1 ? 
rows_data_size : rows_offsets[r + 1] - 1; + const int row_value_size = row_end - row_start + 1; + const char* row_value = &rows_data[row_start]; + + const int32_t col_value = cols_data[r]; + const int32_t a_value = a_data[r]; + printf( + "Cell (%.*s, %i) has data %d\n", + row_value_size, + row_value, + col_value, + a_value); + } +} + void read_array() { // Create TileDB context tiledb_ctx_t* ctx; @@ -200,13 +226,14 @@ void read_array() { // Attribute/dimension buffers // (unknown number of cells, buffer sizes are estimates) - char rows_data[64]; + const size_t NUM_CELLS = 2; + char rows_data[NUM_CELLS * 16]; uint64_t rows_data_size = sizeof(rows_data); - uint64_t rows_offsets[8]; + uint64_t rows_offsets[NUM_CELLS]; uint64_t rows_offsets_size = sizeof(rows_offsets); - int32_t cols_data[8]; + int32_t cols_data[NUM_CELLS]; uint64_t cols_size = sizeof(cols_data); - int32_t a_data[8]; + int32_t a_data[NUM_CELLS]; uint64_t a_size = sizeof(a_data); // Create query @@ -272,12 +299,34 @@ void read_array() { ctx, query, "Max(rows)", max_offsets, &max_offsets_size)); // Submit query - tiledb_query_submit(ctx, query); + TRY(ctx, tiledb_query_submit(ctx, query)); + + tiledb_query_status_t status; + TRY(ctx, tiledb_query_get_status(ctx, query, &status)); + while (status == TILEDB_INCOMPLETE) { + print_cells( + a_size / sizeof(int32_t), + rows_offsets, + rows_data_size, + rows_data, + cols_data, + a_data); + + TRY(ctx, tiledb_query_submit(ctx, query)); + TRY(ctx, tiledb_query_get_status(ctx, query, &status)); + } // Close array tiledb_array_close(ctx, array); - // Print out the results. + // Print out the final results. 
+ print_cells( + a_size / sizeof(int32_t), + rows_offsets, + rows_data_size, + rows_data, + cols_data, + a_data); printf( "Min has data %.*s\n", (int)(min_size - min_offsets[0]), @@ -287,25 +336,6 @@ void read_array() { (int)(max_size - max_offsets[0]), &max[max_offsets[0]]); - uint64_t result_num = (uint64_t)(a_size / sizeof(int32_t)); - for (uint64_t r = 0; r < result_num; r++) { - // For strings we must compute the length based on the offsets - uint64_t row_start = rows_offsets[r]; - uint64_t row_end = - r == result_num - 1 ? result_num : rows_offsets[r + 1] - 1; - const int row_value_size = row_end - row_start + 1; - const char* row_value = &rows_data[row_start]; - - const int32_t col_value = cols_data[r]; - const int32_t a_value = a_data[r]; - printf( - "Cell (%.*s, %i) has data %d\n", - row_value_size, - row_value, - col_value, - a_value); - } - // Clean up free((void*)min); free((void*)max); From b230180bb71c32bd897a4fbc0a47383f35234123 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Thu, 12 Sep 2024 11:29:05 -0400 Subject: [PATCH 05/12] comments, formatting --- examples/c_api/aggregates_string.c | 7 +++---- examples/c_api/tiledb_examples.h | 17 ++++++++++++++++- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/examples/c_api/aggregates_string.c b/examples/c_api/aggregates_string.c index 38b995b685d..6b87ce515ec 100644 --- a/examples/c_api/aggregates_string.c +++ b/examples/c_api/aggregates_string.c @@ -28,9 +28,9 @@ * @section DESCRIPTION * * When run, this program will create a 2D sparse array with one dimension a - * string type, and the other an integer. This models closely what a dataframe - * looks like. The program will write some data to it, and compute the min - * and max values of the string dimension using aggregates. + * string type, and the other an integer. The program will write some data to + * it, and run a query to select coordinates and compute the min and max values + * of the string dimension using aggregates. 
*/ #include @@ -363,4 +363,3 @@ int main() { read_array(); return 0; } - diff --git a/examples/c_api/tiledb_examples.h b/examples/c_api/tiledb_examples.h index 6bdfcc8cfe8..826357000ab 100644 --- a/examples/c_api/tiledb_examples.h +++ b/examples/c_api/tiledb_examples.h @@ -4,6 +4,15 @@ #include #include +/** + * Attempt to retrieve an error from the tiledb context + * and print to stderr if present. + * + * @param line the line number of the last API call + * @param ctx the context pointer + * + * @return TILEDB_OK if no error was found, TILEDB_ERR if one was. + */ static int try_print_error(int line, tiledb_ctx_t* ctx) { // Retrieve the last error that occurred tiledb_error_t* err = NULL; @@ -23,6 +32,10 @@ static int try_print_error(int line, tiledb_ctx_t* ctx) { return TILEDB_ERR; } +/** + * Attempt to retrieve an error from the tiledb context. + * If present, print to stderr and exit. + */ #define IF_ERROR_EXIT(ctx) \ do { \ if (try_print_error(__LINE__, (ctx)) != TILEDB_OK) { \ @@ -30,6 +43,9 @@ static int try_print_error(int line, tiledb_ctx_t* ctx) { } \ } while (0) +/** + * Run a tiledb API and then check for errors, exiting if one is found. 
+ */ #define TRY(ctx, capi_call) \ do { \ const capi_return_t __ret = (capi_call); \ @@ -39,4 +55,3 @@ static int try_print_error(int line, tiledb_ctx_t* ctx) { } while (0) #endif - From 2f4f24ffde0351c27b1cd93316d5d02f770a046e Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Tue, 17 Sep 2024 10:35:29 -0400 Subject: [PATCH 06/12] aggregates_string.cc --- examples/cpp_api/aggregates_string.cc | 226 ++++++++++++++++++++++++++ 1 file changed, 226 insertions(+) create mode 100644 examples/cpp_api/aggregates_string.cc diff --git a/examples/cpp_api/aggregates_string.cc b/examples/cpp_api/aggregates_string.cc new file mode 100644 index 00000000000..57e05817df6 --- /dev/null +++ b/examples/cpp_api/aggregates_string.cc @@ -0,0 +1,226 @@ +/** + * @file aggregates_string.cc + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2018-2024 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + * @section DESCRIPTION + * + * When run, this program will create a 2D sparse array with one dimension a + * string type, and the other an integer. The program will write some data to + * it, and run a query to select coordinates and compute the min and max values + * of the string dimension using aggregates. + */ + +#include +#include +#include + +using namespace tiledb; + +// Name of array +std::string array_name("aggregates_string_array"); + +void create_array() { + // Create a TileDB context. + Context ctx; + + // The array will be 2d array with dimensions "rows" and "cols" + // "rows" is a string dimension type, so the domain and extent is null + Domain domain(ctx); + domain + .add_dimension( + Dimension::create(ctx, "rows", TILEDB_STRING_ASCII, nullptr, nullptr)) + .add_dimension(Dimension::create(ctx, "cols", {{1, 4}}, 4)); + + // The array will be sparse. + ArraySchema schema(ctx, TILEDB_SPARSE); + schema.set_domain(domain).set_order({{TILEDB_ROW_MAJOR, TILEDB_ROW_MAJOR}}); + + // Add a single attribute "a" so each (i,j) cell can store an integer. + schema.add_attribute(Attribute::create(ctx, "a")); + + // Create the (empty) array on disk. 
+ Array::create(array_name, schema); +} + +void write_array() { + Context ctx; + + Array array(ctx, array_name, TILEDB_WRITE); + Query query(ctx, array, TILEDB_WRITE); + + // Global order enables writes in stages to a single fragment + // but requires input to match global order + query.set_layout(TILEDB_GLOBAL_ORDER); + + // First write + char rows_1[] = {"barbazcorgefoo"}; + std::vector rows_offsets_1 = {0, 3, 6, 11}; + std::vector cols_1 = {1, 2, 3, 4}; + std::vector a_1 = {3, 3, 5, 3}; + + query.set_data_buffer("a", a_1) + .set_data_buffer("rows", &rows_1[0], sizeof(rows_1)) + .set_offsets_buffer("rows", rows_offsets_1) + .set_data_buffer("cols", cols_1); + query.submit(); + + // Second write + char rows_2[] = {"garplygraultgubquux"}; + std::vector rows_offsets_2 = {0, 6, 12, 15}; + std::vector cols_2 = {1, 2, 3, 4}; + std::vector a_2 = {6, 6, 3, 4}; + + query.set_data_buffer("a", a_2) + .set_data_buffer("rows", &rows_2[0], sizeof(rows_2)) + .set_offsets_buffer("rows", rows_offsets_2) + .set_data_buffer("cols", cols_2); + query.submit(); + + // Finalize the write (IMPORTANT) and close the array. + query.finalize(); + array.close(); +} + +void print_cells( + uint64_t result_num, + uint64_t* rows_offsets, + uint64_t rows_data_size, + char* rows_data, + int32_t* cols_data, + int32_t* a_data) { + for (uint64_t r = 0; r < result_num; r++) { + // For strings we must compute the length based on the offsets + uint64_t row_start = rows_offsets[r]; + uint64_t row_end = + r == result_num - 1 ? 
rows_data_size : rows_offsets[r + 1]; + const int row_value_size = row_end - row_start; + const char* row_value = &rows_data[row_start]; + + const int32_t col_value = cols_data[r]; + const int32_t a_value = a_data[r]; + printf( + "Cell (%.*s, %i) has data %d\n", + row_value_size, + row_value, + col_value, + a_value); + } +} + +void read_array() { + Context ctx; + + // Prepare the array for reading + Array array(ctx, array_name, TILEDB_READ); + + // Attribute/dimension buffers + // (unknown number of cells, buffer sizes are estimates; + // query may be read in multiple stages) + constexpr size_t NUM_CELLS = 2; + std::vector rows_data(NUM_CELLS * 16); + std::vector rows_offsets(NUM_CELLS); + std::vector cols_data(NUM_CELLS); + std::vector a_data(NUM_CELLS); + + // Aggregate result buffers (1 cell each of unknown size) + constexpr size_t MAX_RESULT_LENGTH = 64; + std::vector min_value(MAX_RESULT_LENGTH); + std::vector min_offsets(1); + std::vector max_value(MAX_RESULT_LENGTH); + std::vector max_offsets(1); + + // Create a query + Query query(ctx, array); + + // Query cells with a >= 4 + QueryCondition qc(ctx); + int32_t a_lower_bound = 4; + qc.init("a", &a_lower_bound, sizeof(int32_t), TILEDB_GE); + query.set_condition(qc); + + // Add aggregates for min(rows) and max(rows) on the default channel. + QueryChannel default_channel = QueryExperimental::get_default_channel(query); + ChannelOperation min_rows = + QueryExperimental::create_unary_aggregate(query, "rows"); + default_channel.apply_aggregate("Min(rows)", min_rows); + ChannelOperation max_rows = + QueryExperimental::create_unary_aggregate(query, "rows"); + default_channel.apply_aggregate("Max(rows)", max_rows); + + // Set layout and buffers. 
+ query.set_layout(TILEDB_UNORDERED) + .set_data_buffer("rows", rows_data) + .set_offsets_buffer("rows", rows_offsets) + .set_data_buffer("cols", cols_data) + .set_data_buffer("a", a_data) + .set_data_buffer("Min(rows)", min_value) + .set_offsets_buffer("Min(rows)", min_offsets) + .set_data_buffer("Max(rows)", max_value) + .set_offsets_buffer("Max(rows)", max_offsets); + + auto print_current_cells = [&]() { + print_cells( + query.result_buffer_elements()["rows"].first, + &rows_offsets[0], + query.result_buffer_elements()["rows"].second, + &rows_data[0], + &cols_data[0], + &a_data[0]); + }; + + // Submit the query and close the array. + while (query.submit() == Query::Status::INCOMPLETE) { + const size_t num_results = query.result_buffer_elements()["rows"].first; + + // NB: this is not generically a valid assertion + // (see reading_incomplete.cc) + // but is true by construction in this example + assert(num_results > 0); + + print_current_cells(); + } + array.close(); + + print_current_cells(); + + // Print out the results. 
+ const size_t min_value_size = + query.result_buffer_elements()["Min(rows)"].second; + const size_t max_value_size = + query.result_buffer_elements()["Max(rows)"].second; + std::cout << "Min(rows) = " << std::string(&min_value[0], min_value_size) + << std::endl; + std::cout << "Max(rows) = " << std::string(&max_value[0], max_value_size) + << std::endl; +} + +int main() { + create_array(); + write_array(); + read_array(); + return 0; +} + From efd5b06f8b85a78a57f9ff7cca1067415440b50b Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Tue, 17 Sep 2024 10:35:52 -0400 Subject: [PATCH 07/12] Fix aggregates_string.cc result size bug --- examples/c_api/aggregates_string.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/c_api/aggregates_string.c b/examples/c_api/aggregates_string.c index 6b87ce515ec..b08987e0f56 100644 --- a/examples/c_api/aggregates_string.c +++ b/examples/c_api/aggregates_string.c @@ -187,8 +187,8 @@ void print_cells( // For strings we must compute the length based on the offsets uint64_t row_start = rows_offsets[r]; uint64_t row_end = - r == result_num - 1 ? rows_data_size : rows_offsets[r + 1] - 1; - const int row_value_size = row_end - row_start + 1; + r == result_num - 1 ? 
rows_data_size : rows_offsets[r + 1]; + const int row_value_size = row_end - row_start; const char* row_value = &rows_data[row_start]; const int32_t col_value = cols_data[r]; From d110916ecfc695138d07db27e160cfe3f3c100a5 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Tue, 17 Sep 2024 10:46:39 -0400 Subject: [PATCH 08/12] Review cosmetics --- examples/c_api/aggregates_string.c | 11 ++++------- examples/c_api/tiledb_examples.h | 8 ++++---- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/examples/c_api/aggregates_string.c b/examples/c_api/aggregates_string.c index b08987e0f56..0294a84fe59 100644 --- a/examples/c_api/aggregates_string.c +++ b/examples/c_api/aggregates_string.c @@ -226,7 +226,7 @@ void read_array() { // Attribute/dimension buffers // (unknown number of cells, buffer sizes are estimates) - const size_t NUM_CELLS = 2; +#define NUM_CELLS 2 char rows_data[NUM_CELLS * 16]; uint64_t rows_data_size = sizeof(rows_data); uint64_t rows_offsets[NUM_CELLS]; @@ -235,6 +235,7 @@ void read_array() { uint64_t cols_size = sizeof(cols_data); int32_t a_data[NUM_CELLS]; uint64_t a_size = sizeof(a_data); +#undef NUM_CELLS // Create query tiledb_query_t* query; @@ -265,23 +266,19 @@ void read_array() { tiledb_query_get_default_channel(ctx, query, &default_channel); // Apply min aggregate - const tiledb_channel_operator_t* operator_min; tiledb_channel_operation_t* min_rows; - TRY(ctx, tiledb_channel_operator_min_get(ctx, &operator_min)); TRY(ctx, tiledb_create_unary_aggregate( - ctx, query, operator_min, "rows", &min_rows)); + ctx, query, tiledb_channel_operator_min, "rows", &min_rows)); TRY(ctx, tiledb_channel_apply_aggregate( ctx, default_channel, "Min(rows)", min_rows)); // Apply max aggregate - const tiledb_channel_operator_t* operator_max; tiledb_channel_operation_t* max_rows; - TRY(ctx, tiledb_channel_operator_max_get(ctx, &operator_max)); TRY(ctx, tiledb_create_unary_aggregate( - ctx, query, operator_max, "rows", &max_rows)); + ctx, query, 
tiledb_channel_operator_max, "rows", &max_rows)); TRY(ctx, tiledb_channel_apply_aggregate( ctx, default_channel, "Max(rows)", max_rows)); diff --git a/examples/c_api/tiledb_examples.h b/examples/c_api/tiledb_examples.h index 826357000ab..383e2518c50 100644 --- a/examples/c_api/tiledb_examples.h +++ b/examples/c_api/tiledb_examples.h @@ -11,7 +11,7 @@ * @param line the line number of the last API call * @param ctx the context pointer * - * @return TILEDB_OK if no error was found, TILEDB_ERR if one was. + * @return 0 if no error was found, 1 if one was. */ static int try_print_error(int line, tiledb_ctx_t* ctx) { // Retrieve the last error that occurred @@ -19,7 +19,7 @@ static int try_print_error(int line, tiledb_ctx_t* ctx) { tiledb_ctx_get_last_error(ctx, &err); if (err == NULL) { - return TILEDB_OK; + return 0; } const char* msg; @@ -29,7 +29,7 @@ static int try_print_error(int line, tiledb_ctx_t* ctx) { // Clean up tiledb_error_free(&err); - return TILEDB_ERR; + return 1; } /** @@ -39,7 +39,7 @@ static int try_print_error(int line, tiledb_ctx_t* ctx) { #define IF_ERROR_EXIT(ctx) \ do { \ if (try_print_error(__LINE__, (ctx)) != TILEDB_OK) { \ - exit(TILEDB_ERR); \ + abort(); \ } \ } while (0) From a8e7e2e34b4b82d32394360f04a82cc399b4bfc8 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Tue, 17 Sep 2024 10:48:30 -0400 Subject: [PATCH 09/12] Stack allocation instead of malloc --- examples/c_api/aggregates_string.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/examples/c_api/aggregates_string.c b/examples/c_api/aggregates_string.c index 0294a84fe59..5dbdefcf2b7 100644 --- a/examples/c_api/aggregates_string.c +++ b/examples/c_api/aggregates_string.c @@ -213,16 +213,20 @@ void read_array() { tiledb_array_open(ctx, array, TILEDB_READ); // Calculate maximum buffer sizes - uint64_t max_size = 64; // variable-length result has unknown size +#define VAR_BUFFER_SIZE 64 + uint64_t max_size = + VAR_BUFFER_SIZE; // variable-length result has 
unknown size uint64_t max_offsets_size = sizeof(uint64_t); - uint64_t min_size = 64; // variable-length result has unknown size + uint64_t min_size = + VAR_BUFFER_SIZE; // variable-length result has unknown size uint64_t min_offsets_size = sizeof(uint64_t); // Aggregate result buffers (1 cell each of unknown size) - char* max = (char*)malloc(max_size); + char max[VAR_BUFFER_SIZE]; uint64_t max_offsets[1]; - char* min = (char*)malloc(min_size); + char min[VAR_BUFFER_SIZE]; uint64_t min_offsets[1]; +#undef VAR_BUFFER_SIZE // Attribute/dimension buffers // (unknown number of cells, buffer sizes are estimates) @@ -334,8 +338,6 @@ void read_array() { &max[max_offsets[0]]); // Clean up - free((void*)min); - free((void*)max); tiledb_aggregate_free(ctx, &min_rows); tiledb_aggregate_free(ctx, &max_rows); tiledb_query_channel_free(ctx, &default_channel); From d90478af5352e83b4c260fc7efc010b61db597e9 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Tue, 17 Sep 2024 10:52:17 -0400 Subject: [PATCH 10/12] Use single context for duration of aggregates_string.c example --- examples/c_api/aggregates_string.c | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/examples/c_api/aggregates_string.c b/examples/c_api/aggregates_string.c index 5dbdefcf2b7..9437eb26681 100644 --- a/examples/c_api/aggregates_string.c +++ b/examples/c_api/aggregates_string.c @@ -43,11 +43,7 @@ // Name of array. 
const char* array_name = "aggregates_string_array"; -void create_array() { - // Create TileDB context - tiledb_ctx_t* ctx; - tiledb_ctx_alloc(NULL, &ctx); - +void create_array(tiledb_ctx_t* ctx) { // The array will be 2d array with dimensions "rows" and "cols" // "rows" is a string dimension type, so the domain and extent is null int dim_domain[] = {1, 4}; @@ -85,14 +81,9 @@ void create_array() { tiledb_dimension_free(&d2); tiledb_domain_free(&domain); tiledb_array_schema_free(&array_schema); - tiledb_ctx_free(&ctx); } -void write_array() { - // Create TileDB context - tiledb_ctx_t* ctx; - tiledb_ctx_alloc(NULL, &ctx); - +void write_array(tiledb_ctx_t* ctx) { // Open array for writing tiledb_array_t* array; tiledb_array_alloc(ctx, array_name, &array); @@ -173,7 +164,6 @@ void write_array() { // Clean up tiledb_array_free(&array); tiledb_query_free(&query); - tiledb_ctx_free(&ctx); } void print_cells( @@ -202,11 +192,7 @@ void print_cells( } } -void read_array() { - // Create TileDB context - tiledb_ctx_t* ctx; - tiledb_ctx_alloc(NULL, &ctx); - +void read_array(tiledb_ctx_t* ctx) { // Open array for reading tiledb_array_t* array; tiledb_array_alloc(ctx, array_name, &array); @@ -343,7 +329,6 @@ void read_array() { tiledb_query_channel_free(ctx, &default_channel); tiledb_array_free(&array); tiledb_query_free(&query); - tiledb_ctx_free(&ctx); } int main() { @@ -352,13 +337,13 @@ int main() { tiledb_ctx_alloc(NULL, &ctx); tiledb_object_t type; tiledb_object_type(ctx, array_name, &type); - tiledb_ctx_free(&ctx); if (type != TILEDB_ARRAY) { - create_array(); - write_array(); + create_array(ctx); + write_array(ctx); } - read_array(); + read_array(ctx); + tiledb_ctx_free(&ctx); return 0; } From 32d6dad242f1c76e699ded41df0b42cd0e649d50 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Tue, 17 Sep 2024 10:52:45 -0400 Subject: [PATCH 11/12] Fix formatting --- examples/cpp_api/aggregates_string.cc | 1 - 1 file changed, 1 deletion(-) diff --git 
a/examples/cpp_api/aggregates_string.cc b/examples/cpp_api/aggregates_string.cc index 57e05817df6..a2e6f966d4d 100644 --- a/examples/cpp_api/aggregates_string.cc +++ b/examples/cpp_api/aggregates_string.cc @@ -223,4 +223,3 @@ int main() { read_array(); return 0; } - From 72a750ece6b3432373748a1bcb5dd28ff9d78fbf Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Tue, 17 Sep 2024 10:58:04 -0400 Subject: [PATCH 12/12] Assert that INCOMPLETE still makes progress --- examples/c_api/aggregates_string.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/examples/c_api/aggregates_string.c b/examples/c_api/aggregates_string.c index 9437eb26681..00044cc3798 100644 --- a/examples/c_api/aggregates_string.c +++ b/examples/c_api/aggregates_string.c @@ -33,6 +33,7 @@ * of the string dimension using aggregates. */ +#include #include #include #include @@ -291,8 +292,15 @@ void read_array(tiledb_ctx_t* ctx) { tiledb_query_status_t status; TRY(ctx, tiledb_query_get_status(ctx, query, &status)); while (status == TILEDB_INCOMPLETE) { + const uint64_t num_results = a_size / sizeof(int32_t); + + // NB: this is not generically a valid assertion + // (see reading_incomplete.c) + // but is true by construction in this example + assert(num_results); + print_cells( - a_size / sizeof(int32_t), + num_results, rows_offsets, rows_data_size, rows_data,