Patch summary (text before the first "diff --git" header is ignored by patch tools):
  * CI: pin actions/checkout to a commit SHA with full history; move the macOS job to macos-15.
  * Toolchain: force ARROW_IPC=ON for the bundled Arrow build; fetch nanoarrow 0.6.0 via FetchContent.
  * Library: add the Arrow C data interface header and a nanoarrow-based example; link nanoarrow.
  * Tests: move the Arrow and Avro tests into their own test/arrow and test/avro directories.

diff --git a/.github/workflows/cpp-linter.yml b/.github/workflows/cpp-linter.yml
index da58dca94..b31e62592 100644
--- a/.github/workflows/cpp-linter.yml
+++ b/.github/workflows/cpp-linter.yml
@@ -30,7 +30,10 @@ jobs:
   cpp-linter:
     runs-on: ubuntu-24.04
     steps:
-      - uses: actions/checkout@v4
+      - name: Checkout iceberg-cpp
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
       - name: Run build
         run: |
           mkdir build && cd build
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 502941462..43aebad1b 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -55,8 +55,8 @@ jobs:
         shell: bash
         run: ci/scripts/build_example.sh $(pwd)/example
   macos:
-    name: AArch64 macOS 14
-    runs-on: macos-14
+    name: AArch64 macOS 15
+    runs-on: macos-15
     timeout-minutes: 30
     strategy:
       fail-fast: false
diff --git a/cmake_modules/IcebergThirdpartyToolchain.cmake b/cmake_modules/IcebergThirdpartyToolchain.cmake
index 0f95ea429..53b20c188 100644
--- a/cmake_modules/IcebergThirdpartyToolchain.cmake
+++ b/cmake_modules/IcebergThirdpartyToolchain.cmake
@@ -65,6 +65,10 @@ function(resolve_arrow_dependency)
   set(ARROW_BUILD_STATIC
       ON
       CACHE BOOL "" FORCE)
+  # To workaround https://github.com/apache/arrow/pull/45513
+  set(ARROW_IPC
+      ON
+      CACHE BOOL "" FORCE)
   set(ARROW_FILESYSTEM
       OFF
       CACHE BOOL "" FORCE)
@@ -198,3 +202,27 @@ endfunction()
 if(ICEBERG_AVRO)
   resolve_avro_dependency()
 endif()
+
+# ----------------------------------------------------------------------
+# Nanoarrow
+
+# It is also possible to vendor nanoarrow using the bundled source code.
+function(resolve_nanoarrow_dependency)
+  prepare_fetchcontent()
+
+  fetchcontent_declare(nanoarrow
+                       ${FC_DECLARE_COMMON_OPTIONS}
+                       URL "https://dlcdn.apache.org/arrow/apache-arrow-nanoarrow-0.6.0/apache-arrow-nanoarrow-0.6.0.tar.gz"
+  )
+  fetchcontent_makeavailable(nanoarrow)
+
+  set_target_properties(nanoarrow PROPERTIES OUTPUT_NAME "iceberg_vendored_nanoarrow"
+                                             POSITION_INDEPENDENT_CODE ON)
+  install(TARGETS nanoarrow
+          EXPORT iceberg_targets
+          RUNTIME DESTINATION "${ICEBERG_INSTALL_BINDIR}"
+          ARCHIVE DESTINATION "${ICEBERG_INSTALL_LIBDIR}"
+          LIBRARY DESTINATION "${ICEBERG_INSTALL_LIBDIR}")
+endfunction()
+
+resolve_nanoarrow_dependency()
diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt
index 8411c7ac5..0c1475bbb 100644
--- a/src/iceberg/CMakeLists.txt
+++ b/src/iceberg/CMakeLists.txt
@@ -15,13 +15,36 @@
 # specific language governing permissions and limitations
 # under the License.
 
-set(ICEBERG_SOURCES demo_table.cc schema.cc schema_field.cc type.cc)
+set(ICEBERG_SOURCES
+    arrow_c_data_internal.cc
+    demo_table.cc
+    schema.cc
+    schema_field.cc
+    type.cc)
+
+set(ICEBERG_STATIC_BUILD_INTERFACE_LIBS)
+set(ICEBERG_SHARED_BUILD_INTERFACE_LIBS)
+set(ICEBERG_STATIC_INSTALL_INTERFACE_LIBS)
+set(ICEBERG_SHARED_INSTALL_INTERFACE_LIBS)
+
+list(APPEND ICEBERG_STATIC_BUILD_INTERFACE_LIBS nanoarrow::nanoarrow)
+list(APPEND ICEBERG_SHARED_BUILD_INTERFACE_LIBS nanoarrow::nanoarrow)
+list(APPEND ICEBERG_STATIC_INSTALL_INTERFACE_LIBS "Iceberg::nanoarrow")
+list(APPEND ICEBERG_SHARED_INSTALL_INTERFACE_LIBS "Iceberg::nanoarrow")
 
 add_iceberg_lib(iceberg
                 SOURCES
                 ${ICEBERG_SOURCES}
                 PRIVATE_INCLUDES
-                ${ICEBERG_INCLUDES})
+                ${ICEBERG_INCLUDES}
+                SHARED_LINK_LIBS
+                ${ICEBERG_SHARED_BUILD_INTERFACE_LIBS}
+                STATIC_LINK_LIBS
+                ${ICEBERG_STATIC_BUILD_INTERFACE_LIBS}
+                STATIC_INSTALL_INTERFACE_LIBS
+                ${ICEBERG_STATIC_INSTALL_INTERFACE_LIBS}
+                SHARED_INSTALL_INTERFACE_LIBS
+                ${ICEBERG_SHARED_INSTALL_INTERFACE_LIBS})
 
 iceberg_install_all_headers(iceberg)
 
diff --git a/src/iceberg/arrow_c_data.h b/src/iceberg/arrow_c_data.h
new file mode 100644
index 000000000..43c2adbd8
--- /dev/null
+++ b/src/iceberg/arrow_c_data.h
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+/// \file iceberg/arrow_c_data.h
+/// Arrow C data interface
+///
+/// The Arrow C Data interface (https://arrow.apache.org/docs/format/CDataInterface.html)
+/// is part of the Arrow Columnar Format specification
+/// (https://arrow.apache.org/docs/format/Columnar.html). See the Arrow documentation for
+/// documentation of these structures.
+
+#include <cstdint>
+
+#ifndef ARROW_C_DATA_INTERFACE
+#  define ARROW_C_DATA_INTERFACE
+
+extern "C" {
+struct ArrowSchema {
+  // Array type description
+  const char* format;
+  const char* name;
+  const char* metadata;
+  int64_t flags;
+  int64_t n_children;
+  struct ArrowSchema** children;
+  struct ArrowSchema* dictionary;
+
+  // Release callback
+  void (*release)(struct ArrowSchema*);
+  // Opaque producer-specific data
+  void* private_data;
+};
+
+struct ArrowArray {
+  // Array data description
+  int64_t length;
+  int64_t null_count;
+  int64_t offset;
+  int64_t n_buffers;
+  int64_t n_children;
+  const void** buffers;
+  struct ArrowArray** children;
+  struct ArrowArray* dictionary;
+
+  // Release callback
+  void (*release)(struct ArrowArray*);
+  // Opaque producer-specific data
+  void* private_data;
+};
+
+}  // extern "C"
+
+#endif  // ARROW_C_DATA_INTERFACE
diff --git a/src/iceberg/arrow_c_data_internal.cc b/src/iceberg/arrow_c_data_internal.cc
new file mode 100644
index 000000000..9716b25a4
--- /dev/null
+++ b/src/iceberg/arrow_c_data_internal.cc
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/arrow_c_data_internal.h"
+
+#include <array>
+#include <string>
+#include <utility>
+
+namespace iceberg::internal {
+
+std::pair<ArrowSchema, ArrowArray> CreateExampleArrowSchemaAndArrayByNanoarrow() {
+  ArrowSchema out_schema;
+
+  // Initializes the root struct schema
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(&out_schema, NANOARROW_TYPE_STRUCT));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateChildren(&out_schema, 2));
+
+  // Set up the non-nullable int64 field
+  struct ArrowSchema* int64_field = out_schema.children[0];
+  ArrowSchemaInit(int64_field);
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(int64_field, NANOARROW_TYPE_INT64));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(int64_field, "id"));
+  int64_field->flags &= ~ARROW_FLAG_NULLABLE;
+
+  // Set up the nullable string field
+  struct ArrowSchema* string_field = out_schema.children[1];
+  ArrowSchemaInit(string_field);
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(string_field, NANOARROW_TYPE_STRING));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(string_field, "name"));
+  string_field->flags |= ARROW_FLAG_NULLABLE;
+
+  constexpr int64_t kNumValues = 3;
+  std::array<int64_t, kNumValues> int64_values = {1, 2, 3};
+  std::array<std::string, kNumValues> string_values = {"a", "b", "c"};
+
+  ArrowArray out_array;
+  NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(&out_array, &out_schema, nullptr));
+  ArrowArray* int64_array = out_array.children[0];
+  ArrowArray* string_array = out_array.children[1];
+
+  NANOARROW_THROW_NOT_OK(ArrowArrayStartAppending(int64_array));
+  NANOARROW_THROW_NOT_OK(ArrowArrayStartAppending(string_array));
+
+  for (int64_t i = 0; i < kNumValues; i++) {
+    NANOARROW_THROW_NOT_OK(ArrowArrayAppendInt(int64_array, int64_values[i]));
+    NANOARROW_THROW_NOT_OK(
+        ArrowArrayAppendString(string_array, ArrowCharView(string_values[i].c_str())));
+  }
+
+  NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(int64_array, nullptr));
+  NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(string_array, nullptr));
+
+  out_array.length = kNumValues;
+  out_array.null_count = 0;
+
+  return {out_schema, out_array};
+}
+
+}  // namespace iceberg::internal
diff --git a/src/iceberg/arrow_c_data_internal.h b/src/iceberg/arrow_c_data_internal.h
new file mode 100644
index 000000000..2d913c55e
--- /dev/null
+++ b/src/iceberg/arrow_c_data_internal.h
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <nanoarrow/nanoarrow.hpp>
+
+namespace iceberg::internal {
+
+/**
+ * @brief Create a simple schema with non-nullable int64 and nullable string fields.
+ *
+ * This is the example code to demonstrate the usage of nanoarrow API.
+ */
+std::pair<ArrowSchema, ArrowArray> CreateExampleArrowSchemaAndArrayByNanoarrow();
+
+}  // namespace iceberg::internal
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index c8c7fdf61..e29a76ec7 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -23,4 +23,6 @@ fetchcontent_declare(googletest
                      GTest)
 fetchcontent_makeavailable(googletest)
 
+add_subdirectory(arrow)
+add_subdirectory(avro)
 add_subdirectory(core)
diff --git a/test/arrow/CMakeLists.txt b/test/arrow/CMakeLists.txt
new file mode 100644
index 000000000..0ef658643
--- /dev/null
+++ b/test/arrow/CMakeLists.txt
@@ -0,0 +1,25 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+if(ICEBERG_ARROW)
+  add_executable(arrow_unittest)
+  target_sources(arrow_unittest PRIVATE arrow_test.cc)
+  target_link_libraries(arrow_unittest PRIVATE iceberg_arrow_static Arrow::arrow_static
+                                               GTest::gtest_main)
+  target_include_directories(arrow_unittest PRIVATE "${ICEBERG_INCLUDES}")
+  add_test(NAME arrow_unittest COMMAND arrow_unittest)
+endif()
diff --git a/test/arrow/arrow_test.cc b/test/arrow/arrow_test.cc
new file mode 100644
index 000000000..1d730fc49
--- /dev/null
+++ b/test/arrow/arrow_test.cc
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <arrow/api.h>
+#include <arrow/c/bridge.h>
+#include <arrow/record_batch.h>
+#include <gtest/gtest.h>
+
+#include "iceberg/arrow_c_data_internal.h"
+
+namespace iceberg {
+
+TEST(ArrowCDataTest, CheckArrowSchemaAndArrayByNanoarrow) {
+  auto [schema, array] = internal::CreateExampleArrowSchemaAndArrayByNanoarrow();
+
+  auto arrow_schema = ::arrow::ImportSchema(&schema).ValueOrDie();
+  EXPECT_EQ(arrow_schema->num_fields(), 2);
+
+  auto id_field = arrow_schema->field(0);
+  EXPECT_EQ(id_field->name(), "id");
+  EXPECT_EQ(id_field->type()->id(), ::arrow::Type::INT64);
+  EXPECT_FALSE(id_field->nullable());
+
+  auto name_field = arrow_schema->field(1);
+  EXPECT_EQ(name_field->name(), "name");
+  EXPECT_EQ(name_field->type()->id(), ::arrow::Type::STRING);
+  EXPECT_TRUE(name_field->nullable());
+
+  auto arrow_record_batch = ::arrow::ImportRecordBatch(&array, arrow_schema).ValueOrDie();
+  EXPECT_EQ(arrow_record_batch->num_rows(), 3);
+  EXPECT_EQ(arrow_record_batch->num_columns(), 2);
+
+  auto id_column = arrow_record_batch->column(0);
+  EXPECT_EQ(id_column->type()->id(), ::arrow::Type::INT64);
+  EXPECT_EQ(id_column->GetScalar(0).ValueOrDie()->ToString(), "1");
+  EXPECT_EQ(id_column->GetScalar(1).ValueOrDie()->ToString(), "2");
+  EXPECT_EQ(id_column->GetScalar(2).ValueOrDie()->ToString(), "3");
+
+  auto name_column = arrow_record_batch->column(1);
+  EXPECT_EQ(name_column->type()->id(), ::arrow::Type::STRING);
+  EXPECT_EQ(name_column->GetScalar(0).ValueOrDie()->ToString(), "a");
+  EXPECT_EQ(name_column->GetScalar(1).ValueOrDie()->ToString(), "b");
+  EXPECT_EQ(name_column->GetScalar(2).ValueOrDie()->ToString(), "c");
+}
+
+}  // namespace iceberg
diff --git a/test/avro/CMakeLists.txt b/test/avro/CMakeLists.txt
new file mode 100644
index 000000000..9cd1c0b8a
--- /dev/null
+++ b/test/avro/CMakeLists.txt
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+if(ICEBERG_AVRO)
+  add_executable(avro_unittest)
+  target_sources(avro_unittest PRIVATE avro_test.cc)
+  target_link_libraries(avro_unittest PRIVATE iceberg_avro_static GTest::gtest_main)
+  target_include_directories(avro_unittest PRIVATE "${ICEBERG_INCLUDES}")
+  add_test(NAME avro_unittest COMMAND avro_unittest)
+endif()
diff --git a/test/core/avro_unittest.cc b/test/avro/avro_test.cc
similarity index 100%
rename from test/core/avro_unittest.cc
rename to test/avro/avro_test.cc
diff --git a/test/core/CMakeLists.txt b/test/core/CMakeLists.txt
index 23b084444..6e82b9b40 100644
--- a/test/core/CMakeLists.txt
+++ b/test/core/CMakeLists.txt
@@ -22,14 +22,6 @@ target_link_libraries(core_unittest PRIVATE iceberg_static GTest::gtest_main GTe
 target_include_directories(core_unittest PRIVATE "${ICEBERG_INCLUDES}")
 add_test(NAME core_unittest COMMAND core_unittest)
 
-if(ICEBERG_AVRO)
-  add_executable(avro_unittest)
-  target_sources(avro_unittest PRIVATE avro_unittest.cc)
-  target_link_libraries(avro_unittest PRIVATE iceberg_avro_static GTest::gtest_main)
-  target_include_directories(avro_unittest PRIVATE "${ICEBERG_INCLUDES}")
-  add_test(NAME avro_unittest COMMAND avro_unittest)
-endif()
-
 add_executable(expected_test)
 target_sources(expected_test PRIVATE expected_test.cc)
 target_link_libraries(expected_test PRIVATE iceberg_static GTest::gtest_main)