diff --git a/CHANGELOG.md b/CHANGELOG.md index 5774601dab8..fe364eb970c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,25 @@ +# Apache Arrow 19.0.1 (2025-01-30 08:00:00+00:00) + +## Bug Fixes + +* [GH-44513](https://github.com/apache/arrow/issues/44513) - [C++][Python] Pyarrow.Table.join() breaks on large tables v.18.0.0.dev486 +* [GH-45180](https://github.com/apache/arrow/issues/45180) - [C++][Fuzzing] Fix bug discovered by fuzzing +* [GH-45230](https://github.com/apache/arrow/issues/45230) - [Docs] Add LinkedIn social link and fix top nav scaling problems +* [GH-45283](https://github.com/apache/arrow/issues/45283) - [Python][C++][Parquet] "OSError: Repetition level histogram size mismatch" when reading parquet file in pyarrow since 19.0.0 +* [GH-45296](https://github.com/apache/arrow/issues/45296) - [Python] to\_pandas() fails when pandas option 'future.infer\_string' is True +* [GH-45339](https://github.com/apache/arrow/issues/45339) - [Parquet][C++] Reading parquet with an empty list of row group indices fails +* [GH-45357](https://github.com/apache/arrow/issues/45357) - [C++] Disable failing arrow-flight-test when misusing the library + + +## Improvements + +* [GH-45201](https://github.com/apache/arrow/issues/45201) - [C++][Parquet] Improve performance of writing size statistics +* [GH-45304](https://github.com/apache/arrow/issues/45304) - [C++] Compatibility with newer aws sdk +* [GH-45305](https://github.com/apache/arrow/issues/45305) - [Python] Compatibility with boto 1.36 + + + # Apache Arrow 19.0.0 (2025-01-10) ## New Features and Improvements diff --git a/c_glib/meson.build b/c_glib/meson.build index c668e30e673..453d38e14cb 100644 --- a/c_glib/meson.build +++ b/c_glib/meson.build @@ -35,7 +35,7 @@ project('arrow-glib', 'c', 'cpp', # * 22.04: 0.61.2 meson_version: '>=0.53.2') -version = '19.0.0' +version = '19.0.1' if version.endswith('-SNAPSHOT') version_numbers = version.split('-')[0].split('.') version_tag = version.split('-')[1] diff --git a/c_glib/vcpkg.json b/c_glib/vcpkg.json index 88b010d1b3e..139589ad3ea 100644 --- a/c_glib/vcpkg.json +++ b/c_glib/vcpkg.json @@ -1,6 +1,6 @@ { "name": "arrow-glib", - "version-string": "19.0.0", + "version-string": "19.0.1", "dependencies": [ "glib", "gobject-introspection", diff --git a/ci/appveyor-cpp-setup.bat b/ci/appveyor-cpp-setup.bat index 912b130acff..a5ca7f76bd1 100644 --- a/ci/appveyor-cpp-setup.bat +++ b/ci/appveyor-cpp-setup.bat @@ -86,7 +86,7 @@ set CXX=cl.exe @rem Download Minio somewhere on PATH, for unit tests @rem if "%ARROW_S3%" == "ON" ( - appveyor DownloadFile https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2024-09-13T20-26-02Z -FileName C:\Windows\Minio.exe || exit /B + appveyor DownloadFile https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2025-01-20T14-49-07Z -FileName C:\Windows\Minio.exe || exit /B ) @rem diff --git a/ci/conda_env_python.txt b/ci/conda_env_python.txt index bf915493de3..ed595d05eee 100644 --- a/ci/conda_env_python.txt +++ b/ci/conda_env_python.txt @@ -15,8 +15,10 @@ # specific language governing permissions and limitations # under the License. 
-# don't add pandas here, because it is not a mandatory test dependency -boto3 # not a direct dependency of s3fs, but needed for our s3fs fixture +# Don't add pandas here, because it is not a mandatory test dependency + +# Not a direct dependency of s3fs, but needed for our s3fs fixture +boto3 cffi cython>=0.29.31 cloudpickle diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index 792df1376d9..2651f3e9f02 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -18,7 +18,7 @@ _realname=arrow pkgbase=mingw-w64-${_realname} pkgname="${MINGW_PACKAGE_PREFIX}-${_realname}" -pkgver=19.0.0 +pkgver=19.0.1 pkgrel=8000 pkgdesc="Apache Arrow is a cross-language development platform for in-memory data (mingw-w64)" arch=("any") diff --git a/ci/scripts/install_minio.sh b/ci/scripts/install_minio.sh index 6f9701ab5a1..8685ced0bd1 100755 --- a/ci/scripts/install_minio.sh +++ b/ci/scripts/install_minio.sh @@ -63,7 +63,7 @@ if [ "${version}" != "latest" ]; then fi # Use specific versions for minio server and client to avoid CI failures on new releases. -minio_version="minio.RELEASE.2024-09-13T20-26-02Z" +minio_version="minio.RELEASE.2025-01-20T14-49-07Z" mc_version="mc.RELEASE.2024-09-16T17-43-14Z" download() diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 67189526eff..b2a2731636a 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -79,7 +79,7 @@ if(POLICY CMP0170) cmake_policy(SET CMP0170 NEW) endif() -set(ARROW_VERSION "19.0.0") +set(ARROW_VERSION "19.0.1") string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" ARROW_BASE_VERSION "${ARROW_VERSION}") diff --git a/cpp/src/arrow/compute/key_map_internal.cc b/cpp/src/arrow/compute/key_map_internal.cc index f134c914553..ad264533bff 100644 --- a/cpp/src/arrow/compute/key_map_internal.cc +++ b/cpp/src/arrow/compute/key_map_internal.cc @@ -254,9 +254,9 @@ void SwissTable::early_filter_imp(const int num_keys, const uint32_t* hashes, // Extract from hash: block index and stamp // uint32_t hash = hashes[i]; - uint32_t iblock = hash >> (bits_hash_ - bits_stamp_ - log_blocks_); + uint32_t iblock = hash >> bits_shift_for_block_and_stamp_; uint32_t stamp = iblock & stamp_mask; - iblock >>= bits_stamp_; + iblock >>= bits_shift_for_block_; uint32_t num_block_bytes = num_groupid_bits + 8; const uint8_t* blockbase = @@ -399,7 +399,7 @@ bool SwissTable::find_next_stamp_match(const uint32_t hash, const uint32_t in_sl const uint64_t num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_); constexpr uint64_t stamp_mask = 0x7f; const int stamp = - static_cast((hash >> (bits_hash_ - log_blocks_ - bits_stamp_)) & stamp_mask); + static_cast((hash >> bits_shift_for_block_and_stamp_) & stamp_mask); uint64_t start_slot_id = wrap_global_slot_id(in_slot_id); int match_found; int local_slot; @@ -659,6 +659,9 @@ Status SwissTable::grow_double() { int num_group_id_bits_after = num_groupid_bits_from_log_blocks(log_blocks_ + 1); uint64_t group_id_mask_before = ~0ULL >> (64 - num_group_id_bits_before); int log_blocks_after = log_blocks_ + 1; + int bits_shift_for_block_and_stamp_after = + ComputeBitsShiftForBlockAndStamp(log_blocks_after); + int bits_shift_for_block_after = ComputeBitsShiftForBlock(log_blocks_after); uint64_t block_size_before = (8 + num_group_id_bits_before); uint64_t block_size_after = (8 + num_group_id_bits_after); uint64_t block_size_total_after = (block_size_after << log_blocks_after) + padding_; @@ -701,8 +704,7 @@ Status SwissTable::grow_double() { } int ihalf = block_id_new & 1; - uint8_t stamp_new = - hash >> ((bits_hash_ - 
log_blocks_after - bits_stamp_)) & stamp_mask; + uint8_t stamp_new = (hash >> bits_shift_for_block_and_stamp_after) & stamp_mask; uint64_t group_id_bit_offs = j * num_group_id_bits_before; uint64_t group_id = (util::SafeLoadAs(block_base + 8 + (group_id_bit_offs >> 3)) >> @@ -744,8 +746,7 @@ Status SwissTable::grow_double() { (util::SafeLoadAs(block_base + 8 + (group_id_bit_offs >> 3)) >> (group_id_bit_offs & 7)) & group_id_mask_before; - uint8_t stamp_new = - hash >> ((bits_hash_ - log_blocks_after - bits_stamp_)) & stamp_mask; + uint8_t stamp_new = (hash >> bits_shift_for_block_and_stamp_after) & stamp_mask; uint8_t* block_base_new = blocks_new->mutable_data() + block_id_new * block_size_after; @@ -773,6 +774,8 @@ Status SwissTable::grow_double() { blocks_ = std::move(blocks_new); hashes_ = std::move(hashes_new_buffer); log_blocks_ = log_blocks_after; + bits_shift_for_block_and_stamp_ = bits_shift_for_block_and_stamp_after; + bits_shift_for_block_ = bits_shift_for_block_after; return Status::OK(); } @@ -784,6 +787,8 @@ Status SwissTable::init(int64_t hardware_flags, MemoryPool* pool, int log_blocks log_minibatch_ = util::MiniBatch::kLogMiniBatchLength; log_blocks_ = log_blocks; + bits_shift_for_block_and_stamp_ = ComputeBitsShiftForBlockAndStamp(log_blocks_); + bits_shift_for_block_ = ComputeBitsShiftForBlock(log_blocks_); int num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_); num_inserted_ = 0; @@ -820,6 +825,8 @@ void SwissTable::cleanup() { hashes_ = nullptr; } log_blocks_ = 0; + bits_shift_for_block_and_stamp_ = ComputeBitsShiftForBlockAndStamp(log_blocks_); + bits_shift_for_block_ = ComputeBitsShiftForBlock(log_blocks_); num_inserted_ = 0; } diff --git a/cpp/src/arrow/compute/key_map_internal.h b/cpp/src/arrow/compute/key_map_internal.h index a5e784a9e44..66a9957006d 100644 --- a/cpp/src/arrow/compute/key_map_internal.h +++ b/cpp/src/arrow/compute/key_map_internal.h @@ -203,6 +203,23 @@ class ARROW_EXPORT SwissTable { // Resize large hash tables when 75% full. Status grow_double(); + // When log_blocks is greater than 25, there will be overlapping bits between block id + // and stamp within a 32-bit hash value. So we must check if this is the case when + // right shifting a hash value to retrieve block id and stamp. The following two + // functions derive the number of bits to right shift from the given log_blocks. + static int ComputeBitsShiftForBlockAndStamp(int log_blocks) { + if (ARROW_PREDICT_FALSE(log_blocks + bits_stamp_ > bits_hash_)) { + return 0; + } + return bits_hash_ - log_blocks - bits_stamp_; + } + static int ComputeBitsShiftForBlock(int log_blocks) { + if (ARROW_PREDICT_FALSE(log_blocks + bits_stamp_ > bits_hash_)) { + return bits_hash_ - log_blocks; + } + return bits_stamp_; + } + // Number of hash bits stored in slots in a block. // The highest bits of hash determine block id. // The next set of highest bits is a "stamp" stored in a slot in a block. @@ -214,6 +231,11 @@ class ARROW_EXPORT SwissTable { int log_minibatch_; // Base 2 log of the number of blocks int log_blocks_ = 0; + // The following two variables are derived from log_blocks_ as log_blocks_ changes, and + // used in tight loops to avoid calling the ComputeXXX functions (introducing a + // branching on whether log_blocks_ + bits_stamp_ > bits_hash_). 
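As a side note for readers following this SwissTable change, here is a minimal standalone sketch (an illustration only, not Arrow's internal API; it assumes the 32-bit hash width and 7-bit stamp described in the header above) of how the two precomputed shift amounts decompose a hash into block id and stamp, including the overlapping-bits case that arises once `log_blocks` exceeds 25:

```cpp
#include <cstdint>
#include <initializer_list>
#include <iostream>

constexpr int kBitsHash = 32;   // assumed hash width, matching bits_hash_ above
constexpr int kBitsStamp = 7;   // assumed stamp width, matching bits_stamp_ above

int BitsShiftForBlockAndStamp(int log_blocks) {
  // When block id and stamp together need more than 32 bits, keep the whole hash
  // and let the stamp reuse the low bits of the block id.
  return (log_blocks + kBitsStamp > kBitsHash) ? 0 : kBitsHash - log_blocks - kBitsStamp;
}

int BitsShiftForBlock(int log_blocks) {
  return (log_blocks + kBitsStamp > kBitsHash) ? kBitsHash - log_blocks : kBitsStamp;
}

int main() {
  const uint32_t hash = 0xDEADBEEFu;
  for (int log_blocks : {10, 26}) {  // 26 exercises the overlap case
    uint32_t block_and_stamp = hash >> BitsShiftForBlockAndStamp(log_blocks);
    uint32_t stamp = block_and_stamp & 0x7F;
    uint32_t block_id = block_and_stamp >> BitsShiftForBlock(log_blocks);
    std::cout << "log_blocks=" << log_blocks << " block_id=" << block_id
              << " stamp=" << stamp << "\n";
  }
  return 0;
}
```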
+ int bits_shift_for_block_and_stamp_ = ComputeBitsShiftForBlockAndStamp(log_blocks_); + int bits_shift_for_block_ = ComputeBitsShiftForBlock(log_blocks_); // Number of keys inserted into hash table uint32_t num_inserted_ = 0; @@ -271,8 +293,7 @@ void SwissTable::insert_into_empty_slot(uint32_t slot_id, uint32_t hash, constexpr uint64_t stamp_mask = 0x7f; int start_slot = (slot_id & 7); - int stamp = - static_cast((hash >> (bits_hash_ - log_blocks_ - bits_stamp_)) & stamp_mask); + int stamp = static_cast((hash >> bits_shift_for_block_and_stamp_) & stamp_mask); uint64_t block_id = slot_id >> 3; uint8_t* blockbase = blocks_->mutable_data() + num_block_bytes * block_id; diff --git a/cpp/src/arrow/compute/key_map_internal_avx2.cc b/cpp/src/arrow/compute/key_map_internal_avx2.cc index 1a16603a0fa..be54f7de639 100644 --- a/cpp/src/arrow/compute/key_map_internal_avx2.cc +++ b/cpp/src/arrow/compute/key_map_internal_avx2.cc @@ -45,10 +45,9 @@ int SwissTable::early_filter_imp_avx2_x8(const int num_hashes, const uint32_t* h // Calculate block index and hash stamp for a byte in a block // __m256i vhash = _mm256_loadu_si256(vhash_ptr + i); - __m256i vblock_id = _mm256_srlv_epi32( - vhash, _mm256_set1_epi32(bits_hash_ - bits_stamp_ - log_blocks_)); + __m256i vblock_id = _mm256_srli_epi32(vhash, bits_shift_for_block_and_stamp_); __m256i vstamp = _mm256_and_si256(vblock_id, vstamp_mask); - vblock_id = _mm256_srli_epi32(vblock_id, bits_stamp_); + vblock_id = _mm256_srli_epi32(vblock_id, bits_shift_for_block_); // We now split inputs and process 4 at a time, // in order to process 64-bit blocks @@ -301,19 +300,15 @@ int SwissTable::early_filter_imp_avx2_x32(const int num_hashes, const uint32_t* _mm256_and_si256(vhash2, _mm256_set1_epi32(0xffff0000))); vhash1 = _mm256_or_si256(_mm256_srli_epi32(vhash1, 16), _mm256_and_si256(vhash3, _mm256_set1_epi32(0xffff0000))); - __m256i vstamp_A = _mm256_and_si256( - _mm256_srlv_epi32(vhash0, _mm256_set1_epi32(16 - log_blocks_ - 7)), - _mm256_set1_epi16(0x7f)); - __m256i vstamp_B = _mm256_and_si256( - _mm256_srlv_epi32(vhash1, _mm256_set1_epi32(16 - log_blocks_ - 7)), - _mm256_set1_epi16(0x7f)); + __m256i vstamp_A = _mm256_and_si256(_mm256_srli_epi32(vhash0, 16 - log_blocks_ - 7), + _mm256_set1_epi16(0x7f)); + __m256i vstamp_B = _mm256_and_si256(_mm256_srli_epi32(vhash1, 16 - log_blocks_ - 7), + _mm256_set1_epi16(0x7f)); __m256i vstamp = _mm256_or_si256(vstamp_A, _mm256_slli_epi16(vstamp_B, 8)); - __m256i vblock_id_A = - _mm256_and_si256(_mm256_srlv_epi32(vhash0, _mm256_set1_epi32(16 - log_blocks_)), - _mm256_set1_epi16(block_id_mask)); - __m256i vblock_id_B = - _mm256_and_si256(_mm256_srlv_epi32(vhash1, _mm256_set1_epi32(16 - log_blocks_)), - _mm256_set1_epi16(block_id_mask)); + __m256i vblock_id_A = _mm256_and_si256(_mm256_srli_epi32(vhash0, 16 - log_blocks_), + _mm256_set1_epi16(block_id_mask)); + __m256i vblock_id_B = _mm256_and_si256(_mm256_srli_epi32(vhash1, 16 - log_blocks_), + _mm256_set1_epi16(block_id_mask)); __m256i vblock_id = _mm256_or_si256(vblock_id_A, _mm256_slli_epi16(vblock_id_B, 8)); // Visit all block bytes in reverse order (overwriting data on multiple matches) @@ -392,16 +387,30 @@ int SwissTable::extract_group_ids_avx2(const int num_keys, const uint32_t* hashe } else { for (int i = 0; i < num_keys / unroll; ++i) { __m256i hash = _mm256_loadu_si256(reinterpret_cast(hashes) + i); + // Extend hash and local_slot to 64-bit to compute 64-bit group id offsets to + // gather from. This is to prevent index overflow issues in GH-44513. 
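For context, a scalar sketch of the overflow this hunk fixes (the function and parameter names below are illustrative, not the AVX2 kernel's): once the table grows large enough, `block_id * bytes_per_block` no longer fits in 32 bits, so the gather offsets have to be computed in 64-bit arithmetic before indexing.

```cpp
#include <cstdint>
#include <cstring>

// Hypothetical scalar equivalent of the group-id lookup, assuming a 32-bit hash
// and 1 <= log_blocks <= 31.
uint32_t LoadGroupId(const uint8_t* elements, uint32_t hash, uint32_t local_slot,
                     int log_blocks, uint32_t bytes_per_block, uint32_t bytes_per_slot,
                     uint32_t group_id_mask) {
  // Widen before multiplying: a 32-bit product wraps once the byte offset of the
  // block passes 4 GiB, which is the failure mode reported in GH-44513.
  uint64_t block_id = static_cast<uint64_t>(hash) >> (32 - log_blocks);
  uint64_t pos =
      block_id * bytes_per_block + static_cast<uint64_t>(local_slot) * bytes_per_slot;
  uint32_t group_id;
  std::memcpy(&group_id, elements + pos, sizeof(group_id));
  return group_id & group_id_mask;
}
```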
+ // NB: Use zero-extend conversion for unsigned hash. + __m256i hash_lo = _mm256_cvtepu32_epi64(_mm256_castsi256_si128(hash)); + __m256i hash_hi = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(hash, 1)); __m256i local_slot = _mm256_set1_epi64x(reinterpret_cast(local_slots)[i]); - local_slot = _mm256_shuffle_epi8( - local_slot, _mm256_setr_epi32(0x80808000, 0x80808001, 0x80808002, 0x80808003, - 0x80808004, 0x80808005, 0x80808006, 0x80808007)); - local_slot = _mm256_mullo_epi32(local_slot, _mm256_set1_epi32(byte_size)); - __m256i pos = _mm256_srlv_epi32(hash, _mm256_set1_epi32(bits_hash_ - log_blocks_)); - pos = _mm256_mullo_epi32(pos, _mm256_set1_epi32(byte_multiplier)); - pos = _mm256_add_epi32(pos, local_slot); - __m256i group_id = _mm256_i32gather_epi32(elements, pos, 1); + __m256i local_slot_lo = _mm256_shuffle_epi8( + local_slot, _mm256_setr_epi32(0x80808000, 0x80808080, 0x80808001, 0x80808080, + 0x80808002, 0x80808080, 0x80808003, 0x80808080)); + __m256i local_slot_hi = _mm256_shuffle_epi8( + local_slot, _mm256_setr_epi32(0x80808004, 0x80808080, 0x80808005, 0x80808080, + 0x80808006, 0x80808080, 0x80808007, 0x80808080)); + local_slot_lo = _mm256_mul_epu32(local_slot_lo, _mm256_set1_epi32(byte_size)); + local_slot_hi = _mm256_mul_epu32(local_slot_hi, _mm256_set1_epi32(byte_size)); + __m256i pos_lo = _mm256_srli_epi64(hash_lo, bits_hash_ - log_blocks_); + __m256i pos_hi = _mm256_srli_epi64(hash_hi, bits_hash_ - log_blocks_); + pos_lo = _mm256_mul_epu32(pos_lo, _mm256_set1_epi32(byte_multiplier)); + pos_hi = _mm256_mul_epu32(pos_hi, _mm256_set1_epi32(byte_multiplier)); + pos_lo = _mm256_add_epi64(pos_lo, local_slot_lo); + pos_hi = _mm256_add_epi64(pos_hi, local_slot_hi); + __m128i group_id_lo = _mm256_i64gather_epi32(elements, pos_lo, 1); + __m128i group_id_hi = _mm256_i64gather_epi32(elements, pos_hi, 1); + __m256i group_id = _mm256_set_m128i(group_id_hi, group_id_lo); group_id = _mm256_and_si256(group_id, _mm256_set1_epi32(mask)); _mm256_storeu_si256(reinterpret_cast<__m256i*>(out_group_ids) + i, group_id); } diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index b6a928ecdd3..773ef84d240 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -1983,27 +1983,33 @@ class ObjectOutputStream final : public io::OutputStream { const void* data, int64_t nbytes, std::shared_ptr owned_buffer = nullptr) { req.SetBucket(ToAwsString(path_.bucket)); req.SetKey(ToAwsString(path_.key)); - req.SetBody(std::make_shared(data, nbytes)); req.SetContentLength(nbytes); RETURN_NOT_OK(SetSSECustomerKey(&req, sse_customer_key_)); if (!background_writes_) { - req.SetBody(std::make_shared(data, nbytes)); + // GH-45304: avoid setting a body stream if length is 0. + // This workaround can be removed once we require AWS SDK 1.11.489 or later. 
+ if (nbytes != 0) { + req.SetBody(std::make_shared(data, nbytes)); + } ARROW_ASSIGN_OR_RAISE(auto outcome, TriggerUploadRequest(req, holder_)); RETURN_NOT_OK(sync_result_callback(req, upload_state_, part_number_, outcome)); } else { - // If the data isn't owned, make an immutable copy for the lifetime of the closure - if (owned_buffer == nullptr) { - ARROW_ASSIGN_OR_RAISE(owned_buffer, AllocateBuffer(nbytes, io_context_.pool())); - memcpy(owned_buffer->mutable_data(), data, nbytes); - } else { - DCHECK_EQ(data, owned_buffer->data()); - DCHECK_EQ(nbytes, owned_buffer->size()); + // (GH-45304: avoid setting a body stream if length is 0, see above) + if (nbytes != 0) { + // If the data isn't owned, make an immutable copy for the lifetime of the closure + if (owned_buffer == nullptr) { + ARROW_ASSIGN_OR_RAISE(owned_buffer, AllocateBuffer(nbytes, io_context_.pool())); + memcpy(owned_buffer->mutable_data(), data, nbytes); + } else { + DCHECK_EQ(data, owned_buffer->data()); + DCHECK_EQ(nbytes, owned_buffer->size()); + } + req.SetBody(std::make_shared(owned_buffer->data(), + owned_buffer->size())); } - req.SetBody( - std::make_shared(owned_buffer->data(), owned_buffer->size())); { std::unique_lock lock(upload_state_->mutex); @@ -2345,7 +2351,6 @@ class S3FileSystem::Impl : public std::enable_shared_from_this("")); return OutcomeToStatus( std::forward_as_tuple("When creating key '", key, "' in bucket '", bucket, "': "), "PutObject", client_lock.Move()->PutObject(req)); diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc index 3082ecb7843..370f3b26852 100644 --- a/cpp/src/arrow/filesystem/s3fs_test.cc +++ b/cpp/src/arrow/filesystem/s3fs_test.cc @@ -526,7 +526,6 @@ class TestS3FS : public S3TestMixin { Aws::S3::Model::PutObjectRequest req; req.SetBucket(ToAwsString("bucket")); req.SetKey(ToAwsString("emptydir/")); - req.SetBody(std::make_shared("")); RETURN_NOT_OK(OutcomeToStatus("PutObject", client_->PutObject(req))); // NOTE: no need to create intermediate "directories" somedir/ and // somedir/subdir/ diff --git a/cpp/src/arrow/flight/test_definitions.h b/cpp/src/arrow/flight/test_definitions.h index 1e0e8c209ac..1391ffc40bd 100644 --- a/cpp/src/arrow/flight/test_definitions.h +++ b/cpp/src/arrow/flight/test_definitions.h @@ -306,12 +306,13 @@ class ARROW_FLIGHT_EXPORT AsyncClientTest : public FlightTest { std::unique_ptr server_; }; +// DISABLED TestListenerLifetime: https://github.com/apache/arrow/issues/45120 #define ARROW_FLIGHT_TEST_ASYNC_CLIENT(FIXTURE) \ static_assert(std::is_base_of::value, \ ARROW_STRINGIFY(FIXTURE) " must inherit from AsyncClientTest"); \ TEST_F(FIXTURE, TestGetFlightInfo) { TestGetFlightInfo(); } \ TEST_F(FIXTURE, TestGetFlightInfoFuture) { TestGetFlightInfoFuture(); } \ - TEST_F(FIXTURE, TestListenerLifetime) { TestListenerLifetime(); } + TEST_F(FIXTURE, DISABLED_TestListenerLifetime) { TestListenerLifetime(); } } // namespace flight } // namespace arrow diff --git a/cpp/src/arrow/util/basic_decimal.cc b/cpp/src/arrow/util/basic_decimal.cc index 22db1e70519..e82078055f3 100644 --- a/cpp/src/arrow/util/basic_decimal.cc +++ b/cpp/src/arrow/util/basic_decimal.cc @@ -50,6 +50,11 @@ static constexpr uint64_t kInt64Mask = 0xFFFFFFFFFFFFFFFF; static constexpr uint64_t kInt32Mask = 0xFFFFFFFF; #endif +BasicDecimal32& BasicDecimal32::Negate() { + value_ = arrow::internal::SafeSignedNegate(value_); + return *this; +} + DecimalStatus BasicDecimal32::Divide(const BasicDecimal32& divisor, BasicDecimal32* result, BasicDecimal32* remainder) 
const { @@ -152,6 +157,11 @@ BasicDecimal32::operator BasicDecimal64() const { return BasicDecimal64(static_cast(value())); } +BasicDecimal64& BasicDecimal64::Negate() { + value_ = arrow::internal::SafeSignedNegate(value_); + return *this; +} + DecimalStatus BasicDecimal64::Divide(const BasicDecimal64& divisor, BasicDecimal64* result, BasicDecimal64* remainder) const { @@ -253,12 +263,18 @@ const BasicDecimal64& BasicDecimal64::GetHalfScaleMultiplier(int32_t scale) { bool BasicDecimal32::FitsInPrecision(int32_t precision) const { DCHECK_GE(precision, 0); DCHECK_LE(precision, kMaxPrecision); + if (value_ == INT32_MIN) { + return false; + } return Abs(*this) < DecimalTraits::powers_of_ten()[precision]; } bool BasicDecimal64::FitsInPrecision(int32_t precision) const { DCHECK_GE(precision, 0); DCHECK_LE(precision, kMaxPrecision); + if (value_ == INT64_MIN) { + return false; + } return Abs(*this) < DecimalTraits::powers_of_ten()[precision]; } diff --git a/cpp/src/arrow/util/basic_decimal.h b/cpp/src/arrow/util/basic_decimal.h index b5404bb7bc6..638c4870f1d 100644 --- a/cpp/src/arrow/util/basic_decimal.h +++ b/cpp/src/arrow/util/basic_decimal.h @@ -276,10 +276,7 @@ class ARROW_EXPORT BasicDecimal32 : public SmallBasicDecimal { using ValueType = int32_t; /// \brief Negate the current value (in-place) - BasicDecimal32& Negate() { - value_ = -value_; - return *this; - } + BasicDecimal32& Negate(); /// \brief Absolute value (in-place) BasicDecimal32& Abs() { return *this < 0 ? Negate() : *this; } @@ -429,10 +426,7 @@ class ARROW_EXPORT BasicDecimal64 : public SmallBasicDecimal { using ValueType = int64_t; /// \brief Negate the current value (in-place) - BasicDecimal64& Negate() { - value_ = -value_; - return *this; - } + BasicDecimal64& Negate(); /// \brief Absolute value (in-place) BasicDecimal64& Abs() { return *this < 0 ? 
Negate() : *this; } diff --git a/cpp/src/arrow/util/decimal_test.cc b/cpp/src/arrow/util/decimal_test.cc index d2f8ae3b7aa..e0aa0b2b85a 100644 --- a/cpp/src/arrow/util/decimal_test.cc +++ b/cpp/src/arrow/util/decimal_test.cc @@ -219,6 +219,28 @@ class DecimalFromStringTest : public ::testing::Test { } }; +TEST(Decimal32Test, TestIntMinNegate) { + Decimal32 d(INT32_MIN); + auto neg = d.Negate(); + ASSERT_EQ(neg, Decimal32(arrow::internal::SafeSignedNegate(INT32_MIN))); +} + +TEST(Decimal32Test, TestIntMinFitsPrecision) { + Decimal32 d(INT32_MIN); + ASSERT_FALSE(d.FitsInPrecision(9)); +} + +TEST(Decimal64Test, TestIntMinNegate) { + Decimal64 d(INT64_MIN); + auto neg = d.Negate(); + ASSERT_EQ(neg, Decimal64(arrow::internal::SafeSignedNegate(INT64_MIN))); +} + +TEST(Decimal64Test, TestIntMinFitsPrecision) { + Decimal64 d(INT64_MIN); + ASSERT_FALSE(d.FitsInPrecision(18)); +} + TYPED_TEST_SUITE(DecimalFromStringTest, DecimalTypes); TYPED_TEST(DecimalFromStringTest, Basics) { this->TestBasics(); } diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 856c032c358..b84e4a41f13 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -4296,6 +4296,108 @@ TEST(TestArrowReaderAdHoc, ReadFloat16Files) { } } +TEST(TestArrowFileReader, RecordBatchReaderEmptyRowGroups) { + const int num_columns = 1; + const int num_rows = 3; + const int num_chunks = 1; + + std::shared_ptr table; + ASSERT_NO_FATAL_FAILURE(MakeDoubleTable(num_columns, num_rows, num_chunks, &table)); + + const int64_t row_group_size = num_rows; + std::shared_ptr buffer; + ASSERT_NO_FATAL_FAILURE(WriteTableToBuffer(table, row_group_size, + default_arrow_writer_properties(), &buffer)); + + auto reader = ParquetFileReader::Open(std::make_shared(buffer)); + std::unique_ptr file_reader; + ASSERT_OK( + FileReader::Make(::arrow::default_memory_pool(), std::move(reader), &file_reader)); + // This is the important part in this test. + std::vector row_group_indices = {}; + ASSERT_OK_AND_ASSIGN(auto record_batch_reader, + file_reader->GetRecordBatchReader(row_group_indices)); + std::shared_ptr<::arrow::RecordBatch> record_batch; + ASSERT_OK(record_batch_reader->ReadNext(&record_batch)); + // No read record batch for empty row groups request. + ASSERT_FALSE(record_batch); +} + +TEST(TestArrowFileReader, RecordBatchReaderEmptyInput) { + const int num_columns = 1; + // This is the important part in this test. + const int num_rows = 0; + const int num_chunks = 1; + + std::shared_ptr
<Table> table;
+ ASSERT_NO_FATAL_FAILURE(MakeDoubleTable(num_columns, num_rows, num_chunks, &table));
+
+ const int64_t row_group_size = num_rows;
+ std::shared_ptr<Buffer> buffer;
+ ASSERT_NO_FATAL_FAILURE(WriteTableToBuffer(table, row_group_size,
+ default_arrow_writer_properties(), &buffer));
+
+ auto reader = ParquetFileReader::Open(std::make_shared<BufferReader>(buffer));
+ std::unique_ptr<FileReader> file_reader;
+ ASSERT_OK(
+ FileReader::Make(::arrow::default_memory_pool(), std::move(reader), &file_reader));
+ ASSERT_OK_AND_ASSIGN(auto record_batch_reader, file_reader->GetRecordBatchReader());
+ std::shared_ptr<::arrow::RecordBatch> record_batch;
+ ASSERT_OK(record_batch_reader->ReadNext(&record_batch));
+ // No read record batch for empty data.
+ ASSERT_FALSE(record_batch);
+}
+
+TEST(TestArrowColumnReader, NextBatchZeroBatchSize) {
+ const int num_columns = 1;
+ const int num_rows = 3;
+ const int num_chunks = 1;
+
+ std::shared_ptr<Table> table;
+ ASSERT_NO_FATAL_FAILURE(MakeDoubleTable(num_columns, num_rows, num_chunks, &table));
+
+ const int64_t row_group_size = num_rows;
+ std::shared_ptr<Buffer> buffer;
+ ASSERT_NO_FATAL_FAILURE(WriteTableToBuffer(table, row_group_size,
+ default_arrow_writer_properties(), &buffer));
+
+ auto reader = ParquetFileReader::Open(std::make_shared<BufferReader>(buffer));
+ std::unique_ptr<FileReader> file_reader;
+ ASSERT_OK(
+ FileReader::Make(::arrow::default_memory_pool(), std::move(reader), &file_reader));
+ std::unique_ptr<ColumnReader> column_reader;
+ ASSERT_OK(file_reader->GetColumn(0, &column_reader));
+ std::shared_ptr<ChunkedArray> chunked_array;
+ // This is the important part in this test.
+ ASSERT_OK(column_reader->NextBatch(0, &chunked_array));
+ ASSERT_EQ(0, chunked_array->length());
+}
+
+TEST(TestArrowColumnReader, NextBatchEmptyInput) {
+ const int num_columns = 1;
+ // This is the important part in this test.
+ const int num_rows = 0;
+ const int num_chunks = 1;
+
+ std::shared_ptr<Table>
table; + ASSERT_NO_FATAL_FAILURE(MakeDoubleTable(num_columns, num_rows, num_chunks, &table)); + + const int64_t row_group_size = num_rows; + std::shared_ptr buffer; + ASSERT_NO_FATAL_FAILURE(WriteTableToBuffer(table, row_group_size, + default_arrow_writer_properties(), &buffer)); + + auto reader = ParquetFileReader::Open(std::make_shared(buffer)); + std::unique_ptr file_reader; + ASSERT_OK( + FileReader::Make(::arrow::default_memory_pool(), std::move(reader), &file_reader)); + std::unique_ptr column_reader; + ASSERT_OK(file_reader->GetColumn(0, &column_reader)); + std::shared_ptr chunked_array; + ASSERT_OK(column_reader->NextBatch(10, &chunked_array)); + ASSERT_EQ(0, chunked_array->length()); +} + // direct-as-possible translation of // pyarrow/tests/test_parquet.py::test_validate_schema_write_table TEST(TestArrowWriterAdHoc, SchemaMismatch) { diff --git a/cpp/src/parquet/arrow/arrow_statistics_test.cc b/cpp/src/parquet/arrow/arrow_statistics_test.cc index a8e2287d370..048518644c6 100644 --- a/cpp/src/parquet/arrow/arrow_statistics_test.cc +++ b/cpp/src/parquet/arrow/arrow_statistics_test.cc @@ -185,16 +185,17 @@ TEST(StatisticsTest, TruncateOnlyHalfMinMax) { namespace { ::arrow::Result> StatisticsReadArray( - std::shared_ptr<::arrow::DataType> data_type, std::shared_ptr<::arrow::Array> array) { + std::shared_ptr<::arrow::DataType> data_type, std::shared_ptr<::arrow::Array> array, + std::shared_ptr writer_properties = default_writer_properties(), + const ArrowReaderProperties& reader_properties = default_arrow_reader_properties()) { auto schema = ::arrow::schema({::arrow::field("column", data_type)}); auto record_batch = ::arrow::RecordBatch::Make(schema, array->length(), {array}); ARROW_ASSIGN_OR_RAISE(auto sink, ::arrow::io::BufferOutputStream::Create()); const auto arrow_writer_properties = parquet::ArrowWriterProperties::Builder().store_schema()->build(); - ARROW_ASSIGN_OR_RAISE( - auto writer, - FileWriter::Open(*schema, ::arrow::default_memory_pool(), sink, - default_writer_properties(), arrow_writer_properties)); + ARROW_ASSIGN_OR_RAISE(auto writer, + FileWriter::Open(*schema, ::arrow::default_memory_pool(), sink, + writer_properties, arrow_writer_properties)); ARROW_RETURN_NOT_OK(writer->WriteRecordBatch(*record_batch)); ARROW_RETURN_NOT_OK(writer->Close()); ARROW_ASSIGN_OR_RAISE(auto buffer, sink->Finish()); @@ -202,8 +203,8 @@ ::arrow::Result> StatisticsReadArray( auto reader = ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer)); std::unique_ptr file_reader; - ARROW_RETURN_NOT_OK( - FileReader::Make(::arrow::default_memory_pool(), std::move(reader), &file_reader)); + ARROW_RETURN_NOT_OK(FileReader::Make(::arrow::default_memory_pool(), std::move(reader), + reader_properties, &file_reader)); std::shared_ptr<::arrow::ChunkedArray> chunked_array; ARROW_RETURN_NOT_OK(file_reader->ReadColumn(0, &chunked_array)); return chunked_array->chunk(0); @@ -326,4 +327,44 @@ TEST(TestStatisticsRead, Duration) { ::arrow::duration(::arrow::TimeUnit::NANO)); } +TEST(TestStatisticsRead, MultipleRowGroupsDefault) { + auto arrow_type = ::arrow::int32(); + auto built_array = ArrayFromJSON(arrow_type, R"([1, null, -1])"); + auto writer_properties = WriterProperties::Builder().max_row_group_length(2)->build(); + ASSERT_OK_AND_ASSIGN( + auto read_array, + StatisticsReadArray(arrow_type, std::move(built_array), writer_properties)); + auto typed_read_array = std::static_pointer_cast<::arrow::Int32Array>(read_array); + auto statistics = typed_read_array->statistics(); + 
ASSERT_EQ(nullptr, statistics); +} + +TEST(TestStatisticsRead, MultipleRowGroupsShouldLoadStatistics) { + auto arrow_type = ::arrow::int32(); + auto built_array = ArrayFromJSON(arrow_type, R"([1, null, -1])"); + auto writer_properties = WriterProperties::Builder().max_row_group_length(2)->build(); + ArrowReaderProperties reader_properties; + reader_properties.set_should_load_statistics(true); + ASSERT_OK_AND_ASSIGN(auto read_array, + StatisticsReadArray(arrow_type, std::move(built_array), + writer_properties, reader_properties)); + // If we use should_load_statistics, reader doesn't load multiple + // row groups at once. So the first array in the read chunked array + // has only 2 elements. + ASSERT_EQ(2, read_array->length()); + auto typed_read_array = std::static_pointer_cast<::arrow::Int32Array>(read_array); + auto statistics = typed_read_array->statistics(); + ASSERT_NE(nullptr, statistics); + ASSERT_EQ(true, statistics->null_count.has_value()); + ASSERT_EQ(1, statistics->null_count.value()); + ASSERT_EQ(false, statistics->distinct_count.has_value()); + ASSERT_EQ(true, statistics->min.has_value()); + // This is not -1 because this array has only the first 2 elements. + ASSERT_EQ(1, std::get(*statistics->min)); + ASSERT_EQ(true, statistics->is_min_exact); + ASSERT_EQ(true, statistics->max.has_value()); + ASSERT_EQ(1, std::get(*statistics->max)); + ASSERT_EQ(true, statistics->is_max_exact); +} + } // namespace parquet::arrow diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 465ad5844d3..03b725beb2a 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -218,6 +218,7 @@ class FileReaderImpl : public FileReader { ctx->iterator_factory = SomeRowGroupsFactory(row_groups); ctx->filter_leaves = true; ctx->included_leaves = included_leaves; + ctx->reader_properties = &reader_properties_; return GetReader(manifest_.schema_fields[i], ctx, out); } @@ -475,6 +476,8 @@ class LeafReader : public ColumnReaderImpl { record_reader_->Reset(); // Pre-allocation gives much better performance for flat columns record_reader_->Reserve(records_to_read); + const bool should_load_statistics = ctx_->reader_properties->should_load_statistics(); + int64_t num_target_row_groups = 0; while (records_to_read > 0) { if (!record_reader_->HasMoreData()) { break; @@ -483,11 +486,21 @@ class LeafReader : public ColumnReaderImpl { records_to_read -= records_read; if (records_read == 0) { NextRowGroup(); + } else { + num_target_row_groups++; + // We can't mix multiple row groups when we load statistics + // because statistics are associated with a row group. If we + // want to mix multiple row groups and keep valid statistics, + // we need to implement a statistics merge logic. + if (should_load_statistics) { + break; + } } } - RETURN_NOT_OK(TransferColumnData(record_reader_.get(), - input_->column_chunk_metadata(), field_, descr_, - ctx_.get(), &out_)); + RETURN_NOT_OK(TransferColumnData( + record_reader_.get(), + num_target_row_groups == 1 ? 
input_->column_chunk_metadata() : nullptr, field_, + descr_, ctx_.get(), &out_)); return Status::OK(); END_PARQUET_CATCH_EXCEPTIONS } @@ -1214,6 +1227,7 @@ Status FileReaderImpl::GetColumn(int i, FileColumnIteratorFactory iterator_facto ctx->pool = pool_; ctx->iterator_factory = iterator_factory; ctx->filter_leaves = false; + ctx->reader_properties = &reader_properties_; std::unique_ptr result; RETURN_NOT_OK(GetReader(manifest_.schema_fields[i], ctx, &result)); *out = std::move(result); diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index 9d3171ea1a9..59fe2b46002 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -322,6 +322,10 @@ template void AttachStatistics(::arrow::ArrayData* data, std::unique_ptr<::parquet::ColumnChunkMetaData> metadata, const ReaderContext* ctx) { + if (!metadata) { + return; + } + using ArrowCType = typename ArrowType::c_type; auto statistics = metadata->statistics().get(); diff --git a/cpp/src/parquet/arrow/reader_internal.h b/cpp/src/parquet/arrow/reader_internal.h index fab56c88804..4ee3bf98bc5 100644 --- a/cpp/src/parquet/arrow/reader_internal.h +++ b/cpp/src/parquet/arrow/reader_internal.h @@ -117,6 +117,7 @@ struct ReaderContext { FileColumnIteratorFactory iterator_factory; bool filter_leaves; std::shared_ptr> included_leaves; + ArrowReaderProperties* reader_properties; bool IncludesLeaf(int leaf_index) const { if (this->filter_leaves) { diff --git a/cpp/src/parquet/arrow/test_util.h b/cpp/src/parquet/arrow/test_util.h index c8fcbbb65d1..cfc57ce6ea7 100644 --- a/cpp/src/parquet/arrow/test_util.h +++ b/cpp/src/parquet/arrow/test_util.h @@ -229,7 +229,9 @@ ::arrow::enable_if_floating_point NullableArray( } ::arrow::NumericBuilder builder; - RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data())); + if (values.size() > 0) { + RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data())); + } return builder.Finish(out); } diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 12cbcf20aff..4998e6f301a 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -1468,42 +1468,43 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< // which case we call back to the dense write path) std::shared_ptr<::arrow::Array> preserved_dictionary_; - int64_t WriteLevels(int64_t num_values, const int16_t* def_levels, + int64_t WriteLevels(int64_t num_levels, const int16_t* def_levels, const int16_t* rep_levels) { + // Update histograms now, to maximize cache efficiency. 
+ UpdateLevelHistogram(num_levels, def_levels, rep_levels); + int64_t values_to_write = 0; // If the field is required and non-repeated, there are no definition levels if (descr_->max_definition_level() > 0) { - for (int64_t i = 0; i < num_values; ++i) { + for (int64_t i = 0; i < num_levels; ++i) { if (def_levels[i] == descr_->max_definition_level()) { ++values_to_write; } } - WriteDefinitionLevels(num_values, def_levels); + WriteDefinitionLevels(num_levels, def_levels); } else { // Required field, write all values - values_to_write = num_values; + values_to_write = num_levels; } // Not present for non-repeated fields if (descr_->max_repetition_level() > 0) { // A row could include more than one value // Count the occasions where we start a new row - for (int64_t i = 0; i < num_values; ++i) { + for (int64_t i = 0; i < num_levels; ++i) { if (rep_levels[i] == 0) { rows_written_++; num_buffered_rows_++; } } - WriteRepetitionLevels(num_values, rep_levels); + WriteRepetitionLevels(num_levels, rep_levels); } else { // Each value is exactly one row - rows_written_ += num_values; - num_buffered_rows_ += num_values; + rows_written_ += num_levels; + num_buffered_rows_ += num_levels; } - - UpdateLevelHistogram(num_values, def_levels, rep_levels); return values_to_write; } @@ -1575,6 +1576,9 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< void WriteLevelsSpaced(int64_t num_levels, const int16_t* def_levels, const int16_t* rep_levels) { + // Update histograms now, to maximize cache efficiency. + UpdateLevelHistogram(num_levels, def_levels, rep_levels); + // If the field is required and non-repeated, there are no definition levels if (descr_->max_definition_level() > 0) { WriteDefinitionLevels(num_levels, def_levels); @@ -1595,8 +1599,6 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< rows_written_ += num_levels; num_buffered_rows_ += num_levels; } - - UpdateLevelHistogram(num_levels, def_levels, rep_levels); } void UpdateLevelHistogram(int64_t num_levels, const int16_t* def_levels, @@ -1606,26 +1608,20 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< } auto add_levels = [](std::vector& level_histogram, - ::arrow::util::span levels) { - for (int16_t level : levels) { - ARROW_DCHECK_LT(level, static_cast(level_histogram.size())); - ++level_histogram[level]; + ::arrow::util::span levels, int16_t max_level) { + if (max_level == 0) { + return; } + ARROW_DCHECK_EQ(static_cast(max_level) + 1, level_histogram.size()); + ::parquet::UpdateLevelHistogram(levels, level_histogram); }; - if (descr_->max_definition_level() > 0) { - add_levels(page_size_statistics_->definition_level_histogram, - {def_levels, static_cast(num_levels)}); - } else { - page_size_statistics_->definition_level_histogram[0] += num_levels; - } - - if (descr_->max_repetition_level() > 0) { - add_levels(page_size_statistics_->repetition_level_histogram, - {rep_levels, static_cast(num_levels)}); - } else { - page_size_statistics_->repetition_level_histogram[0] += num_levels; - } + add_levels(page_size_statistics_->definition_level_histogram, + {def_levels, static_cast(num_levels)}, + descr_->max_definition_level()); + add_levels(page_size_statistics_->repetition_level_histogram, + {rep_levels, static_cast(num_levels)}, + descr_->max_repetition_level()); } // Update the unencoded data bytes for ByteArray only per the specification. 
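To make the level-histogram bookkeeping in the writer changes above concrete, here is a small self-contained sketch (illustrative only, not the writer's internals) of how batches of definition levels feed a histogram for an optional, non-repeated column, and how its entries map to null and value counts:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Count each definition level seen in one write batch.
void AddLevels(std::vector<int64_t>& histogram, const std::vector<int16_t>& def_levels) {
  for (int16_t level : def_levels) {
    ++histogram[level];
  }
}

int main() {
  // max_definition_level == 1: level 0 marks a null, level 1 marks a present value.
  std::vector<int64_t> def_histogram(2, 0);
  AddLevels(def_histogram, {1, 0, 1, 1, 0});  // first write batch of a page
  AddLevels(def_histogram, {1, 1});           // second write batch of the same page
  assert(def_histogram[0] == 2);  // nulls
  assert(def_histogram[1] == 5);  // non-null values
  return 0;
}
```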
diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index edaf28cd92a..46f3ae378dc 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -911,7 +911,8 @@ class PARQUET_EXPORT ArrowReaderProperties { pre_buffer_(true), cache_options_(::arrow::io::CacheOptions::LazyDefaults()), coerce_int96_timestamp_unit_(::arrow::TimeUnit::NANO), - arrow_extensions_enabled_(false) {} + arrow_extensions_enabled_(false), + should_load_statistics_(false) {} /// \brief Set whether to use the IO thread pool to parse columns in parallel. /// @@ -994,6 +995,15 @@ class PARQUET_EXPORT ArrowReaderProperties { } bool get_arrow_extensions_enabled() const { return arrow_extensions_enabled_; } + /// \brief Set whether to load statistics as much as possible. + /// + /// Default is false. + void set_should_load_statistics(bool should_load_statistics) { + should_load_statistics_ = should_load_statistics; + } + /// Return whether loading statistics as much as possible. + bool should_load_statistics() const { return should_load_statistics_; } + private: bool use_threads_; std::unordered_set read_dict_indices_; @@ -1003,6 +1013,7 @@ class PARQUET_EXPORT ArrowReaderProperties { ::arrow::io::CacheOptions cache_options_; ::arrow::TimeUnit::type coerce_int96_timestamp_unit_; bool arrow_extensions_enabled_; + bool should_load_statistics_; }; /// EXPERIMENTAL: Constructs the default ArrowReaderProperties diff --git a/cpp/src/parquet/size_statistics.cc b/cpp/src/parquet/size_statistics.cc index a02cef7aba4..1ce6c937ad5 100644 --- a/cpp/src/parquet/size_statistics.cc +++ b/cpp/src/parquet/size_statistics.cc @@ -18,6 +18,9 @@ #include "parquet/size_statistics.h" #include +#include +#include +#include #include "arrow/util/logging.h" #include "parquet/exception.h" @@ -25,6 +28,17 @@ namespace parquet { +namespace { + +void MergeLevelHistogram(::arrow::util::span histogram, + ::arrow::util::span other) { + ARROW_DCHECK_EQ(histogram.size(), other.size()); + std::transform(histogram.begin(), histogram.end(), other.begin(), histogram.begin(), + std::plus<>()); +} + +} // namespace + void SizeStatistics::Merge(const SizeStatistics& other) { if (repetition_level_histogram.size() != other.repetition_level_histogram.size()) { throw ParquetException("Repetition level histogram size mismatch"); @@ -36,12 +50,8 @@ void SizeStatistics::Merge(const SizeStatistics& other) { other.unencoded_byte_array_data_bytes.has_value()) { throw ParquetException("Unencoded byte array data bytes are not consistent"); } - std::transform(repetition_level_histogram.begin(), repetition_level_histogram.end(), - other.repetition_level_histogram.begin(), - repetition_level_histogram.begin(), std::plus<>()); - std::transform(definition_level_histogram.begin(), definition_level_histogram.end(), - other.definition_level_histogram.begin(), - definition_level_histogram.begin(), std::plus<>()); + MergeLevelHistogram(repetition_level_histogram, other.repetition_level_histogram); + MergeLevelHistogram(definition_level_histogram, other.definition_level_histogram); if (unencoded_byte_array_data_bytes.has_value()) { unencoded_byte_array_data_bytes = unencoded_byte_array_data_bytes.value() + other.unencoded_byte_array_data_bytes.value(); @@ -54,23 +64,28 @@ void SizeStatistics::IncrementUnencodedByteArrayDataBytes(int64_t value) { } void SizeStatistics::Validate(const ColumnDescriptor* descr) const { - if (repetition_level_histogram.size() != - static_cast(descr->max_repetition_level() + 1)) { - throw ParquetException("Repetition level 
histogram size mismatch"); - } - if (definition_level_histogram.size() != - static_cast(descr->max_definition_level() + 1)) { - throw ParquetException("Definition level histogram size mismatch"); - } + auto validate_histogram = [](const std::vector& histogram, int16_t max_level, + const std::string& name) { + if (histogram.empty()) { + // A levels histogram is always allowed to be missing. + return; + } + if (histogram.size() != static_cast(max_level + 1)) { + std::stringstream ss; + ss << name << " level histogram size mismatch, size: " << histogram.size() + << ", expected: " << (max_level + 1); + throw ParquetException(ss.str()); + } + }; + validate_histogram(repetition_level_histogram, descr->max_repetition_level(), + "Repetition"); + validate_histogram(definition_level_histogram, descr->max_definition_level(), + "Definition"); if (unencoded_byte_array_data_bytes.has_value() && descr->physical_type() != Type::BYTE_ARRAY) { throw ParquetException("Unencoded byte array data bytes does not support " + TypeToString(descr->physical_type())); } - if (!unencoded_byte_array_data_bytes.has_value() && - descr->physical_type() == Type::BYTE_ARRAY) { - throw ParquetException("Missing unencoded byte array data bytes"); - } } void SizeStatistics::Reset() { @@ -83,12 +98,103 @@ void SizeStatistics::Reset() { std::unique_ptr SizeStatistics::Make(const ColumnDescriptor* descr) { auto size_stats = std::make_unique(); - size_stats->repetition_level_histogram.resize(descr->max_repetition_level() + 1, 0); - size_stats->definition_level_histogram.resize(descr->max_definition_level() + 1, 0); + // If the max level is 0, the level histogram can be omitted because it contains + // only single level (a.k.a. 0) and its count is equivalent to `num_values` of the + // column chunk or data page. 
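A tiny sketch of the convention stated in the comment above (the reader-side reconstruction is an illustration of mine, not library code): when the maximum level is 0, the histogram may be omitted entirely, since its single entry would just repeat `num_values`.

```cpp
#include <cstdint>
#include <vector>

// Reconstruct a level histogram that the writer was allowed to omit.
std::vector<int64_t> EffectiveLevelHistogram(const std::vector<int64_t>& stored_histogram,
                                             int16_t max_level, int64_t num_values) {
  if (max_level == 0 && stored_histogram.empty()) {
    // Only level 0 can occur, so its count is exactly num_values.
    return {num_values};
  }
  return stored_histogram;
}
```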
+ if (descr->max_repetition_level() != 0) {
+ size_stats->repetition_level_histogram.resize(descr->max_repetition_level() + 1, 0);
+ }
+ if (descr->max_definition_level() != 0) {
+ size_stats->definition_level_histogram.resize(descr->max_definition_level() + 1, 0);
+ }
 if (descr->physical_type() == Type::BYTE_ARRAY) {
 size_stats->unencoded_byte_array_data_bytes = 0;
 }
 return size_stats;
 }

+std::ostream& operator<<(std::ostream& os, const SizeStatistics& size_stats) {
+ constexpr std::string_view kComma = ", ";
+ os << "SizeStatistics{";
+ std::string_view sep = "";
+ if (size_stats.unencoded_byte_array_data_bytes.has_value()) {
+ os << "unencoded_byte_array_data_bytes="
+ << *size_stats.unencoded_byte_array_data_bytes;
+ sep = kComma;
+ }
+ auto print_histogram = [&](std::string_view name,
+ const std::vector<int64_t>& histogram) {
+ if (!histogram.empty()) {
+ os << sep << name << "={";
+ sep = kComma;
+ std::string_view value_sep = "";
+ for (int64_t v : histogram) {
+ os << value_sep << v;
+ value_sep = kComma;
+ }
+ os << "}";
+ }
+ };
+ print_histogram("repetition_level_histogram", size_stats.repetition_level_histogram);
+ print_histogram("definition_level_histogram", size_stats.definition_level_histogram);
+ os << "}";
+ return os;
+}
+
+void UpdateLevelHistogram(::arrow::util::span<const int16_t> levels,
+ ::arrow::util::span<int64_t> histogram) {
+ const int64_t num_levels = static_cast<int64_t>(levels.size());
+ DCHECK_GE(histogram.size(), 1);
+ const int16_t max_level = static_cast<int16_t>(histogram.size() - 1);
+ if (max_level == 0) {
+ histogram[0] += num_levels;
+ return;
+ }
+
+#ifndef NDEBUG
+ for (auto level : levels) {
+ ARROW_DCHECK_LE(level, max_level);
+ }
+#endif
+
+ if (max_level == 1) {
+ // Specialize the common case for non-repeated non-nested columns.
+ // Summing the levels gives us the number of 1s, and the number of 0s follows.
+ // We do repeated sums in the int16_t space, which the compiler is likely
+ // to vectorize efficiently.
+ constexpr int64_t kChunkSize = 1 << 14; // to avoid int16_t overflows
+ int64_t hist1 = 0;
+ auto it = levels.begin();
+ while (it != levels.end()) {
+ const auto chunk_size = std::min<int64_t>(levels.end() - it, kChunkSize);
+ hist1 += std::accumulate(it, it + chunk_size, int16_t{0});
+ it += chunk_size;
+ }
+ histogram[0] += num_levels - hist1;
+ histogram[1] += hist1;
+ return;
+ }
+
+ // The generic implementation issues a series of histogram load-stores.
+ // However, it limits store-to-load dependencies by interleaving partial histogram
+ // updates.
+ constexpr int kUnroll = 4; + std::array, kUnroll> partial_hist; + for (auto& hist : partial_hist) { + hist.assign(histogram.size(), 0); + } + int64_t i = 0; + for (; i <= num_levels - kUnroll; i += kUnroll) { + for (int j = 0; j < kUnroll; ++j) { + ++partial_hist[j][levels[i + j]]; + } + } + for (; i < num_levels; ++i) { + ++partial_hist[0][levels[i]]; + } + for (const auto& hist : partial_hist) { + MergeLevelHistogram(histogram, hist); + } +} + } // namespace parquet diff --git a/cpp/src/parquet/size_statistics.h b/cpp/src/parquet/size_statistics.h index c25e70ee36d..ec79b8c4f8b 100644 --- a/cpp/src/parquet/size_statistics.h +++ b/cpp/src/parquet/size_statistics.h @@ -17,9 +17,12 @@ #pragma once +#include +#include #include #include +#include "arrow/util/span.h" #include "parquet/platform.h" #include "parquet/type_fwd.h" @@ -89,4 +92,11 @@ struct PARQUET_EXPORT SizeStatistics { static std::unique_ptr Make(const ColumnDescriptor* descr); }; +PARQUET_EXPORT +std::ostream& operator<<(std::ostream&, const SizeStatistics&); + +PARQUET_EXPORT +void UpdateLevelHistogram(::arrow::util::span levels, + ::arrow::util::span histogram); + } // namespace parquet diff --git a/cpp/src/parquet/size_statistics_test.cc b/cpp/src/parquet/size_statistics_test.cc index cefd31dce28..90d6df57e7f 100644 --- a/cpp/src/parquet/size_statistics_test.cc +++ b/cpp/src/parquet/size_statistics_test.cc @@ -19,16 +19,14 @@ #include "gtest/gtest.h" #include +#include #include #include "arrow/buffer.h" #include "arrow/table.h" -#include "arrow/testing/builder.h" #include "arrow/testing/gtest_util.h" -#include "arrow/util/bit_util.h" #include "arrow/util/span.h" #include "parquet/arrow/reader.h" -#include "parquet/arrow/reader_internal.h" #include "parquet/arrow/schema.h" #include "parquet/arrow/writer.h" #include "parquet/column_writer.h" @@ -42,6 +40,29 @@ namespace parquet { +TEST(SizeStatistics, UpdateLevelHistogram) { + { + // max_level = 1 + std::vector histogram(2, 0); + UpdateLevelHistogram(std::vector{0, 1, 1, 1, 0}, histogram); + EXPECT_THAT(histogram, ::testing::ElementsAre(2, 3)); + UpdateLevelHistogram(std::vector{1, 1, 0}, histogram); + EXPECT_THAT(histogram, ::testing::ElementsAre(3, 5)); + UpdateLevelHistogram(std::vector{}, histogram); + EXPECT_THAT(histogram, ::testing::ElementsAre(3, 5)); + } + { + // max_level > 1 + std::vector histogram(3, 0); + UpdateLevelHistogram(std::vector{0, 1, 2, 2, 0}, histogram); + EXPECT_THAT(histogram, ::testing::ElementsAre(2, 1, 2)); + UpdateLevelHistogram(std::vector{1, 1, 0}, histogram); + EXPECT_THAT(histogram, ::testing::ElementsAre(3, 3, 2)); + UpdateLevelHistogram(std::vector{}, histogram); + EXPECT_THAT(histogram, ::testing::ElementsAre(3, 3, 2)); + } +} + TEST(SizeStatistics, ThriftSerDe) { const std::vector kDefLevels = {128, 64, 32, 16}; const std::vector kRepLevels = {100, 80, 60, 40, 20}; @@ -88,13 +109,38 @@ struct PageSizeStatistics { } }; +std::ostream& operator<<(std::ostream& os, const PageSizeStatistics& page_stats) { + constexpr std::string_view kComma = ", "; + os << "PageSizeStatistics{"; + std::string_view sep = ""; + auto print_vector = [&](std::string_view name, const std::vector& values) { + if (!values.empty()) { + os << sep << name << "={"; + sep = kComma; + std::string_view value_sep = ""; + for (int64_t v : values) { + os << value_sep << v; + value_sep = kComma; + } + os << "}"; + } + }; + print_vector("def_levels", page_stats.def_levels); + print_vector("rep_levels", page_stats.rep_levels); + print_vector("byte_array_bytes", 
page_stats.byte_array_bytes); + os << "}"; + return os; +} + class SizeStatisticsRoundTripTest : public ::testing::Test { public: - void WriteFile(SizeStatisticsLevel level, - const std::shared_ptr<::arrow::Table>& table) { + void WriteFile(SizeStatisticsLevel level, const std::shared_ptr<::arrow::Table>& table, + int max_row_group_length, int page_size, + int write_batch_size = DEFAULT_WRITE_BATCH_SIZE) { auto writer_properties = WriterProperties::Builder() - .max_row_group_length(2) /* every row group has 2 rows */ - ->data_pagesize(1) /* every page has 1 row */ + .max_row_group_length(max_row_group_length) + ->data_pagesize(page_size) + ->write_batch_size(write_batch_size) ->enable_write_page_index() ->enable_statistics() ->set_size_statistics_level(level) @@ -127,6 +173,7 @@ class SizeStatisticsRoundTripTest : public ::testing::Test { ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer_)); // Read row group size statistics in order. + row_group_stats_.clear(); auto metadata = reader->metadata(); for (int i = 0; i < metadata->num_row_groups(); ++i) { auto row_group_metadata = metadata->RowGroup(i); @@ -138,6 +185,7 @@ class SizeStatisticsRoundTripTest : public ::testing::Test { } // Read page size statistics in order. + page_stats_.clear(); auto page_index_reader = reader->GetPageIndexReader(); ASSERT_NE(page_index_reader, nullptr); @@ -168,12 +216,22 @@ class SizeStatisticsRoundTripTest : public ::testing::Test { } } - void Reset() { - buffer_.reset(); - row_group_stats_.clear(); - page_stats_.clear(); + void ReadData() { + auto reader = + ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer_)); + auto metadata = reader->metadata(); + for (int i = 0; i < metadata->num_row_groups(); ++i) { + int64_t num_rows = metadata->RowGroup(i)->num_rows(); + auto row_group_reader = reader->RowGroup(i); + for (int j = 0; j < metadata->num_columns(); ++j) { + auto column_reader = row_group_reader->RecordReader(j); + ASSERT_EQ(column_reader->ReadRecords(num_rows + 1), num_rows); + } + } } + void Reset() { buffer_.reset(); } + protected: std::shared_ptr buffer_; std::vector row_group_stats_; @@ -187,7 +245,7 @@ TEST_F(SizeStatisticsRoundTripTest, EnableSizeStats) { ::arrow::field("a", ::arrow::list(::arrow::list(::arrow::int32()))), ::arrow::field("b", ::arrow::list(::arrow::list(::arrow::utf8()))), }); - // First two rows are in one row group, and the other two rows are in another row group. + // First two rows will be in one row group, and the other two rows in another row group. 
auto table = ::arrow::TableFromJSON(schema, {R"([ [ [[1],[1,1],[1,1,1]], [["a"],["a","a"],["a","a","a"]] ], [ [[0,1,null]], [["foo","bar",null]] ], @@ -198,7 +256,7 @@ TEST_F(SizeStatisticsRoundTripTest, EnableSizeStats) { for (auto size_stats_level : {SizeStatisticsLevel::None, SizeStatisticsLevel::ColumnChunk, SizeStatisticsLevel::PageAndColumnChunk}) { - WriteFile(size_stats_level, table); + WriteFile(size_stats_level, table, /*max_row_group_length=*/2, /*page_size=*/1); ReadSizeStatistics(); if (size_stats_level == SizeStatisticsLevel::None) { @@ -251,29 +309,104 @@ TEST_F(SizeStatisticsRoundTripTest, WriteDictionaryArray) { {::arrow::field("a", ::arrow::dictionary(::arrow::int16(), ::arrow::utf8()))}); WriteFile( SizeStatisticsLevel::PageAndColumnChunk, - ::arrow::TableFromJSON(schema, {R"([["aa"],["aaa"],[null],["a"],["aaa"],["a"]])"})); - + ::arrow::TableFromJSON(schema, {R"([["aa"],["aaa"],[null],["a"],["aaa"],["a"]])"}), + /*max_row_group_length=*/2, /*page_size=*/1); ReadSizeStatistics(); EXPECT_THAT(row_group_stats_, ::testing::ElementsAre(SizeStatistics{/*def_levels=*/{0, 2}, - /*rep_levels=*/{2}, + /*rep_levels=*/{}, /*byte_array_bytes=*/5}, SizeStatistics{/*def_levels=*/{1, 1}, - /*rep_levels=*/{2}, + /*rep_levels=*/{}, /*byte_array_bytes=*/1}, SizeStatistics{/*def_levels=*/{0, 2}, - /*rep_levels=*/{2}, + /*rep_levels=*/{}, /*byte_array_bytes=*/4})); EXPECT_THAT(page_stats_, ::testing::ElementsAre(PageSizeStatistics{/*def_levels=*/{0, 2}, - /*rep_levels=*/{2}, + /*rep_levels=*/{}, /*byte_array_bytes=*/{5}}, PageSizeStatistics{/*def_levels=*/{1, 1}, - /*rep_levels=*/{2}, + /*rep_levels=*/{}, /*byte_array_bytes=*/{1}}, PageSizeStatistics{/*def_levels=*/{0, 2}, - /*rep_levels=*/{2}, + /*rep_levels=*/{}, /*byte_array_bytes=*/{4}})); } +TEST_F(SizeStatisticsRoundTripTest, WritePageInBatches) { + // Rep/def level histograms are updates in batches of `write_batch_size` levels + // inside a single page. Exercise the logic with more than one batch per page. 
+ auto schema = ::arrow::schema({::arrow::field("a", ::arrow::list(::arrow::utf8()))}); + auto table = ::arrow::TableFromJSON(schema, {R"([ + [ [null,"a","ab"] ], + [ null ], + [ [] ], + [ [null,"d","de"] ], + [ ["g","gh",null] ], + [ ["j","jk",null] ] + ])"}); + for (int write_batch_size : {100, 5, 4, 3, 2, 1}) { + ARROW_SCOPED_TRACE("write_batch_size = ", write_batch_size); + WriteFile(SizeStatisticsLevel::PageAndColumnChunk, table, + /*max_row_group_length=*/1000, /*page_size=*/1000, write_batch_size); + ReadSizeStatistics(); + EXPECT_THAT(row_group_stats_, + ::testing::ElementsAre(SizeStatistics{/*def_levels=*/{1, 1, 4, 8}, + /*rep_levels=*/{6, 8}, + /*byte_array_bytes=*/12})); + EXPECT_THAT(page_stats_, + ::testing::ElementsAre(PageSizeStatistics{/*def_levels=*/{1, 1, 4, 8}, + /*rep_levels=*/{6, 8}, + /*byte_array_bytes=*/{12}})); + } +} + +TEST_F(SizeStatisticsRoundTripTest, LargePage) { + // When max_level is 1, the levels are summed in 2**30 chunks, exercise this + // by testing with a 90000 rows table; + auto schema = ::arrow::schema({::arrow::field("a", ::arrow::utf8())}); + auto seed_batch = ::arrow::RecordBatchFromJSON(schema, R"([ + [ "a" ], + [ "bc" ], + [ null ] + ])"); + ASSERT_OK_AND_ASSIGN(auto table, ::arrow::Table::FromRecordBatches( + ::arrow::RecordBatchVector(30000, seed_batch))); + ASSERT_OK_AND_ASSIGN(table, table->CombineChunks()); + ASSERT_EQ(table->num_rows(), 90000); + + WriteFile(SizeStatisticsLevel::PageAndColumnChunk, table, + /*max_row_group_length=*/1 << 30, /*page_size=*/1 << 30, + /*write_batch_size=*/50000); + ReadSizeStatistics(); + EXPECT_THAT(row_group_stats_, + ::testing::ElementsAre(SizeStatistics{/*def_levels=*/{30000, 60000}, + /*rep_levels=*/{}, + /*byte_array_bytes=*/90000})); + EXPECT_THAT(page_stats_, + ::testing::ElementsAre(PageSizeStatistics{/*def_levels=*/{30000, 60000}, + /*rep_levels=*/{}, + /*byte_array_bytes=*/{90000}})); +} + +TEST_F(SizeStatisticsRoundTripTest, MaxLevelZero) { + auto schema = + ::arrow::schema({::arrow::field("a", ::arrow::utf8(), /*nullable=*/false)}); + WriteFile(SizeStatisticsLevel::PageAndColumnChunk, + ::arrow::TableFromJSON(schema, {R"([["foo"],["bar"]])"}), + /*max_row_group_length=*/2, + /*page_size=*/1024); + ASSERT_NO_FATAL_FAILURE(ReadSizeStatistics()); + ASSERT_NO_FATAL_FAILURE(ReadData()); + EXPECT_THAT(row_group_stats_, + ::testing::ElementsAre(SizeStatistics{/*def_levels=*/{}, + /*rep_levels=*/{}, + /*byte_array_bytes=*/6})); + EXPECT_THAT(page_stats_, + ::testing::ElementsAre(PageSizeStatistics{/*def_levels=*/{}, + /*rep_levels=*/{}, + /*byte_array_bytes=*/{6}})); +} + } // namespace parquet diff --git a/cpp/vcpkg.json b/cpp/vcpkg.json index 75264a31b84..e9c87c1df0e 100644 --- a/cpp/vcpkg.json +++ b/cpp/vcpkg.json @@ -1,6 +1,6 @@ { "name": "arrow", - "version-string": "19.0.0", + "version-string": "19.0.1", "dependencies": [ "abseil", { diff --git a/csharp/Directory.Build.props b/csharp/Directory.Build.props index 993ca1a5958..5279fd8c6ea 100644 --- a/csharp/Directory.Build.props +++ b/csharp/Directory.Build.props @@ -29,7 +29,7 @@ Apache Arrow library Copyright 2016-2024 The Apache Software Foundation The Apache Software Foundation - 19.0.0 + 19.0.1 diff --git a/dev/tasks/homebrew-formulae/apache-arrow-glib.rb b/dev/tasks/homebrew-formulae/apache-arrow-glib.rb index 9c6ed8c5874..582361b5854 100644 --- a/dev/tasks/homebrew-formulae/apache-arrow-glib.rb +++ b/dev/tasks/homebrew-formulae/apache-arrow-glib.rb @@ -29,7 +29,7 @@ class ApacheArrowGlib < Formula desc "GLib bindings for Apache Arrow" 
homepage "https://arrow.apache.org/" - url "https://www.apache.org/dyn/closer.lua?path=arrow/arrow-19.0.0/apache-arrow-19.0.0.tar.gz" + url "https://www.apache.org/dyn/closer.lua?path=arrow/arrow-19.0.1/apache-arrow-19.0.1.tar.gz" sha256 "9948ddb6d4798b51552d0dca3252dd6e3a7d0f9702714fc6f5a1b59397ce1d28" license "Apache-2.0" head "https://github.com/apache/arrow.git", branch: "main" diff --git a/dev/tasks/homebrew-formulae/apache-arrow.rb b/dev/tasks/homebrew-formulae/apache-arrow.rb index 2400461c156..7f8e8556be5 100644 --- a/dev/tasks/homebrew-formulae/apache-arrow.rb +++ b/dev/tasks/homebrew-formulae/apache-arrow.rb @@ -29,7 +29,7 @@ class ApacheArrow < Formula desc "Columnar in-memory analytics layer designed to accelerate big data" homepage "https://arrow.apache.org/" - url "https://www.apache.org/dyn/closer.lua?path=arrow/arrow-19.0.0/apache-arrow-19.0.0.tar.gz" + url "https://www.apache.org/dyn/closer.lua?path=arrow/arrow-19.0.1/apache-arrow-19.0.1.tar.gz" sha256 "9948ddb6d4798b51552d0dca3252dd6e3a7d0f9702714fc6f5a1b59397ce1d28" license "Apache-2.0" head "https://github.com/apache/arrow.git", branch: "main" diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog index da35e4a4f8d..eb6a6bc37d4 100644 --- a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog +++ b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog @@ -1,3 +1,9 @@ +apache-arrow-apt-source (19.0.1-1) unstable; urgency=low + + * New upstream release. + + -- Bryce Mecum Mon, 03 Feb 2025 02:02:38 -0000 + apache-arrow-apt-source (19.0.0-1) unstable; urgency=low * New upstream release. diff --git a/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in b/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in index 6f826ad0a36..8c8e95dde69 100644 --- a/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in +++ b/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in @@ -102,6 +102,9 @@ else fi %changelog +* Mon Feb 03 2025 Bryce Mecum - 19.0.1-1 +- New upstream release. + * Sat Jan 11 2025 Bryce Mecum - 19.0.0-1 - New upstream release. diff --git a/dev/tasks/linux-packages/apache-arrow/debian/changelog b/dev/tasks/linux-packages/apache-arrow/debian/changelog index 10acb10b28e..e9846c4bbe2 100644 --- a/dev/tasks/linux-packages/apache-arrow/debian/changelog +++ b/dev/tasks/linux-packages/apache-arrow/debian/changelog @@ -1,3 +1,9 @@ +apache-arrow (19.0.1-1) unstable; urgency=low + + * New upstream release. + + -- Bryce Mecum Mon, 03 Feb 2025 02:02:38 -0000 + apache-arrow (19.0.0-1) unstable; urgency=low * New upstream release. diff --git a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in index 9ca9898fa9b..0cc9763d5b3 100644 --- a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in +++ b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in @@ -894,6 +894,9 @@ Documentation for Apache Parquet GLib. %endif %changelog +* Mon Feb 03 2025 Bryce Mecum - 19.0.1-1 +- New upstream release. + * Sat Jan 11 2025 Bryce Mecum - 19.0.0-1 - New upstream release. 
diff --git a/docs/source/_static/theme_overrides.css b/docs/source/_static/theme_overrides.css index f7f25269a66..55403b2ed01 100644 --- a/docs/source/_static/theme_overrides.css +++ b/docs/source/_static/theme_overrides.css @@ -25,14 +25,39 @@ --pst-font-weight-heading: 600; } -/* Change header hight to make the logo a bit larger */ +/* Change header height to make the logo a bit larger */ /* only on wider screens */ -@media only screen and (min-width: 1170px){ + +@media only screen and (min-width: 1200px) { :root { --pst-header-height: 6rem; } } +/* Adjust layout of nav to fit narrower screens */ + +@media only screen and (max-width: 1199px) { + + /* Condense link text in nav to preserve layout */ + .navbar-header-items__center a.nav-link:not(.dropdown-item), + .navbar-header-items__center button.nav-item { + letter-spacing: -0.02em; + } + + /* Shrink search button */ + .search-button__default-text, + .search-button__kbd-shortcut { + display:none !important; + } + + /* Reduce horizontal space between icons in nav and sidebar */ + div.sidebar-header-items__end, + div.navbar-header-items__end, + ul.navbar-icon-links { + column-gap: 0.75rem !important; + } +} + /* Contributing landing page overview cards */ .contrib-card { @@ -84,3 +109,9 @@ dl.cpp.enumerator { p.breathe-sectiondef-title { margin-top: 1rem; } + +/* Keep social icons arranged horizontally in sidebar */ + +.sidebar-header-items__end { + flex-wrap: wrap; +} diff --git a/docs/source/conf.py b/docs/source/conf.py index 9b60c4180c6..e9b926e884a 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -342,6 +342,7 @@ }, "header_links_before_dropdown": 2, "header_dropdown_text": "Implementations", + "navbar_align": "left", "navbar_end": ["version-switcher", "theme-switcher", "navbar-icon-links"], "icon_links": [ { @@ -349,6 +350,11 @@ "url": "https://github.com/apache/arrow", "icon": "fa-brands fa-square-github", }, + { + "name": "LinkedIn", + "url": "https://www.linkedin.com/company/apache-arrow/", + "icon": "fa-brands fa-linkedin", + }, { "name": "X", "url": "https://twitter.com/ApacheArrow", diff --git a/js/package.json b/js/package.json index 7aba1c1fee3..29b6ae5852e 100644 --- a/js/package.json +++ b/js/package.json @@ -120,5 +120,5 @@ "engines": { "node": ">=12.0" }, - "version": "19.0.0" + "version": "19.0.1" } diff --git a/matlab/CMakeLists.txt b/matlab/CMakeLists.txt index fb02a40a771..c0f0abcf04d 100644 --- a/matlab/CMakeLists.txt +++ b/matlab/CMakeLists.txt @@ -100,7 +100,7 @@ endfunction() set(CMAKE_CXX_STANDARD 17) -set(MLARROW_VERSION "19.0.0") +set(MLARROW_VERSION "19.0.1") string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" MLARROW_BASE_VERSION "${MLARROW_VERSION}") project(mlarrow VERSION "${MLARROW_BASE_VERSION}") diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 714c6448453..58e2cdcf073 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -28,7 +28,7 @@ project(pyarrow) # which in turn meant that Py_GIL_DISABLED was not set. 
set(CMAKE_NO_SYSTEM_FROM_IMPORTED ON) -set(PYARROW_VERSION "19.0.0") +set(PYARROW_VERSION "19.0.1") string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" PYARROW_BASE_VERSION "${PYARROW_VERSION}") # Generate SO version and full SO version diff --git a/python/pyarrow/pandas-shim.pxi b/python/pyarrow/pandas-shim.pxi index 5be6f03f86e..aab9cf1079e 100644 --- a/python/pyarrow/pandas-shim.pxi +++ b/python/pyarrow/pandas-shim.pxi @@ -38,7 +38,7 @@ cdef class _PandasAPIShim(object): object _array_like_types, _is_extension_array_dtype, _lock bint has_sparse bint _pd024 - bint _is_v1, _is_ge_v21, _is_ge_v3, _is_ge_v3_strict + bint _is_v1, _is_ge_v21, _is_ge_v23, _is_ge_v3, _is_ge_v3_strict def __init__(self): self._lock = Lock() @@ -79,6 +79,7 @@ cdef class _PandasAPIShim(object): self._is_v1 = self._loose_version < Version('2.0.0') self._is_ge_v21 = self._loose_version >= Version('2.1.0') + self._is_ge_v23 = self._loose_version >= Version('2.3.0') self._is_ge_v3 = self._loose_version >= Version('3.0.0.dev0') self._is_ge_v3_strict = self._loose_version >= Version('3.0.0') @@ -171,6 +172,10 @@ cdef class _PandasAPIShim(object): self._check_import() return self._is_ge_v21 + def is_ge_v23(self): + self._check_import() + return self._is_ge_v23 + def is_ge_v3(self): self._check_import() return self._is_ge_v3 @@ -183,7 +188,7 @@ cdef class _PandasAPIShim(object): if self.is_ge_v3_strict(): return True try: - if self.pd.options.future.infer_string: + if self.is_ge_v23() and self.pd.options.future.infer_string: return True except: pass diff --git a/python/pyproject.toml b/python/pyproject.toml index ef2043f6d0b..212ea926bd6 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -81,4 +81,4 @@ root = '..' version_file = 'pyarrow/_generated_version.py' version_scheme = 'guess-next-dev' git_describe_command = 'git describe --dirty --tags --long --match "apache-arrow-[0-9]*.*"' -fallback_version = '19.0.0' +fallback_version = '19.0.1' diff --git a/r/DESCRIPTION b/r/DESCRIPTION index cd57f9a524a..a31e832519d 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -1,6 +1,6 @@ Package: arrow Title: Integration to 'Apache' 'Arrow' -Version: 19.0.0 +Version: 19.0.1 Authors@R: c( person("Neal", "Richardson", email = "neal.p.richardson@gmail.com", role = c("aut")), person("Ian", "Cook", email = "ianmcook@gmail.com", role = c("aut")), diff --git a/r/NEWS.md b/r/NEWS.md index 437867d3ad4..343ce054749 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -17,7 +17,7 @@ under the License. --> -# arrow 19.0.0 +# arrow 19.0.1 # arrow 18.1.0 diff --git a/r/pkgdown/assets/versions.html b/r/pkgdown/assets/versions.html index cbeff74fa4c..db8a97badb9 100644 --- a/r/pkgdown/assets/versions.html +++ b/r/pkgdown/assets/versions.html @@ -1,7 +1,7 @@ -

-19.0.0.9000 (dev)
-19.0.0 (release)
+19.0.1.9000 (dev)
+19.0.1 (release)
 18.1.0
 17.0.0
 16.1.0

diff --git a/r/pkgdown/assets/versions.json b/r/pkgdown/assets/versions.json index 105c4d2f3d0..ea0d663b2a6 100644 --- a/r/pkgdown/assets/versions.json +++ b/r/pkgdown/assets/versions.json @@ -1,10 +1,10 @@ [ { - "name": "19.0.0.9000 (dev)", + "name": "19.0.1.9000 (dev)", "version": "dev/" }, { - "name": "19.0.0 (release)", + "name": "19.0.1 (release)", "version": "" }, { diff --git a/ruby/red-arrow-cuda/lib/arrow-cuda/version.rb b/ruby/red-arrow-cuda/lib/arrow-cuda/version.rb index 2a659bdca19..4888deb0a16 100644 --- a/ruby/red-arrow-cuda/lib/arrow-cuda/version.rb +++ b/ruby/red-arrow-cuda/lib/arrow-cuda/version.rb @@ -16,7 +16,7 @@ # under the License. module ArrowCUDA - VERSION = "19.0.0" + VERSION = "19.0.1" module Version numbers, TAG = VERSION.split("-") diff --git a/ruby/red-arrow-dataset/lib/arrow-dataset/version.rb b/ruby/red-arrow-dataset/lib/arrow-dataset/version.rb index 9b70cd29321..71822d8088f 100644 --- a/ruby/red-arrow-dataset/lib/arrow-dataset/version.rb +++ b/ruby/red-arrow-dataset/lib/arrow-dataset/version.rb @@ -16,7 +16,7 @@ # under the License. module ArrowDataset - VERSION = "19.0.0" + VERSION = "19.0.1" module Version numbers, TAG = VERSION.split("-") diff --git a/ruby/red-arrow-flight-sql/lib/arrow-flight-sql/version.rb b/ruby/red-arrow-flight-sql/lib/arrow-flight-sql/version.rb index f2cd2f51758..f42312670ea 100644 --- a/ruby/red-arrow-flight-sql/lib/arrow-flight-sql/version.rb +++ b/ruby/red-arrow-flight-sql/lib/arrow-flight-sql/version.rb @@ -16,7 +16,7 @@ # under the License. module ArrowFlightSQL - VERSION = "19.0.0" + VERSION = "19.0.1" module Version numbers, TAG = VERSION.split("-") diff --git a/ruby/red-arrow-flight/lib/arrow-flight/version.rb b/ruby/red-arrow-flight/lib/arrow-flight/version.rb index 70e896ce3d4..e5b58e0b42a 100644 --- a/ruby/red-arrow-flight/lib/arrow-flight/version.rb +++ b/ruby/red-arrow-flight/lib/arrow-flight/version.rb @@ -16,7 +16,7 @@ # under the License. module ArrowFlight - VERSION = "19.0.0" + VERSION = "19.0.1" module Version numbers, TAG = VERSION.split("-") diff --git a/ruby/red-arrow/lib/arrow/version.rb b/ruby/red-arrow/lib/arrow/version.rb index f369fd223a9..56a1393e403 100644 --- a/ruby/red-arrow/lib/arrow/version.rb +++ b/ruby/red-arrow/lib/arrow/version.rb @@ -16,7 +16,7 @@ # under the License. module Arrow - VERSION = "19.0.0" + VERSION = "19.0.1" module Version numbers, TAG = VERSION.split("-") diff --git a/ruby/red-gandiva/lib/gandiva/version.rb b/ruby/red-gandiva/lib/gandiva/version.rb index 5226a34f1e0..ea2591701e2 100644 --- a/ruby/red-gandiva/lib/gandiva/version.rb +++ b/ruby/red-gandiva/lib/gandiva/version.rb @@ -16,7 +16,7 @@ # under the License. module Gandiva - VERSION = "19.0.0" + VERSION = "19.0.1" module Version numbers, TAG = VERSION.split("-") diff --git a/ruby/red-parquet/lib/parquet/version.rb b/ruby/red-parquet/lib/parquet/version.rb index e5c90507441..4109fc40434 100644 --- a/ruby/red-parquet/lib/parquet/version.rb +++ b/ruby/red-parquet/lib/parquet/version.rb @@ -16,7 +16,7 @@ # under the License. module Parquet - VERSION = "19.0.0" + VERSION = "19.0.1" module Version numbers, TAG = VERSION.split("-") diff --git a/testing b/testing index 4d209492d51..d2a13712303 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit 4d209492d514c2d3cb2d392681b9aa00e6d8da1c +Subproject commit d2a13712303498963395318a4eb42872e66aead7