diff --git a/c_glib/arrow-glib/basic-array.cpp b/c_glib/arrow-glib/basic-array.cpp index 8320e30c99cc..34b577ea1af8 100644 --- a/c_glib/arrow-glib/basic-array.cpp +++ b/c_glib/arrow-glib/basic-array.cpp @@ -456,6 +456,22 @@ garrow_array_statistics_has_null_count(GArrowArrayStatistics *statistics) return priv->statistics.null_count.has_value(); } +/** + * garrow_array_statistics_is_null_count_exact: + * @statistics: A #GArrowArrayStatistics. + * + * Returns: %TRUE if the null count is available and exact, %FALSE otherwise. + * + * Since: 23.0.0 + */ +gboolean +garrow_array_statistics_is_null_count_exact(GArrowArrayStatistics *statistics) +{ + auto priv = GARROW_ARRAY_STATISTICS_GET_PRIVATE(statistics); + return priv->statistics.null_count.has_value() && + std::holds_alternative<int64_t>(*priv->statistics.null_count); +} + /** * garrow_array_statistics_get_null_count: * @statistics: A #GArrowArrayStatistics. @@ -464,19 +480,59 @@ garrow_array_statistics_has_null_count(GArrowArrayStatistics *statistics) * -1 otherwise. * * Since: 20.0.0 + * + * Deprecated: 23.0.0. Use garrow_array_statistics_is_null_count_exact(), + * garrow_array_statistics_get_null_count_exact() and + * garrow_array_statistics_get_null_count_approximate() instead. */ gint64 garrow_array_statistics_get_null_count(GArrowArrayStatistics *statistics) +{ + return garrow_array_statistics_get_null_count_exact(statistics); +} + +/** + * garrow_array_statistics_get_null_count_exact: + * @statistics: A #GArrowArrayStatistics. + * + * Returns: 0 or larger value if @statistics has a valid exact null + * count value, -1 otherwise. 
+ * + * Since: 23.0.0 + */ +gint64 +garrow_array_statistics_get_null_count_exact(GArrowArrayStatistics *statistics) { auto priv = GARROW_ARRAY_STATISTICS_GET_PRIVATE(statistics); const auto &null_count = priv->statistics.null_count; - if (null_count) { - return null_count.value(); + if (null_count && std::holds_alternative<int64_t>(*null_count)) { + return std::get<int64_t>(*null_count); } else { return -1; } } +/** + * garrow_array_statistics_get_null_count_approximate: + * @statistics: A #GArrowArrayStatistics. + * + * Returns: Non `NaN` value if @statistics has a valid approximate + * null count value, `NaN` otherwise. + * + * Since: 23.0.0 + */ +gdouble +garrow_array_statistics_get_null_count_approximate(GArrowArrayStatistics *statistics) +{ + auto priv = GARROW_ARRAY_STATISTICS_GET_PRIVATE(statistics); + const auto &null_count = priv->statistics.null_count; + if (null_count && std::holds_alternative<double>(*null_count)) { + return std::get<double>(*null_count); + } else { + return std::nan(""); + } +} + /** * garrow_array_statistics_has_distinct_count: * @statistics: A #GArrowArrayStatistics. 
diff --git a/c_glib/arrow-glib/basic-array.h b/c_glib/arrow-glib/basic-array.h index 4021c16f060d..37f61a91cca7 100644 --- a/c_glib/arrow-glib/basic-array.h +++ b/c_glib/arrow-glib/basic-array.h @@ -54,9 +54,21 @@ struct _GArrowArrayStatisticsClass GARROW_AVAILABLE_IN_20_0 gboolean garrow_array_statistics_has_null_count(GArrowArrayStatistics *statistics); +GARROW_AVAILABLE_IN_23_0 +gboolean +garrow_array_statistics_is_null_count_exact(GArrowArrayStatistics *statistics); +#ifndef GARROW_DISABLE_DEPRECATED GARROW_AVAILABLE_IN_20_0 +GARROW_DEPRECATED_IN_23_0_FOR(garrow_array_statistics_get_null_count_exact) gint64 garrow_array_statistics_get_null_count(GArrowArrayStatistics *statistics); +#endif +GARROW_AVAILABLE_IN_23_0 +gint64 +garrow_array_statistics_get_null_count_exact(GArrowArrayStatistics *statistics); +GARROW_AVAILABLE_IN_23_0 +gdouble +garrow_array_statistics_get_null_count_approximate(GArrowArrayStatistics *statistics); GARROW_AVAILABLE_IN_21_0 gboolean diff --git a/c_glib/test/test-array-statistics.rb b/c_glib/test/test-array-statistics.rb index 03407b7e3402..34c0e4aa7a83 100644 --- a/c_glib/test/test-array-statistics.rb +++ b/c_glib/test/test-array-statistics.rb @@ -45,8 +45,20 @@ def setup end end - test("#null_count") do - assert_equal(1, @statistics.null_count) + test("#null_count_exact?") do + assert do + @statistics.null_count_exact? + end + end + + test("#null_count_exact") do + assert_equal(1, @statistics.null_count_exact) + end + + test("#null_count_approximate") do + assert do + @statistics.null_count_approximate.nan? 
+ end end test("#has_distinct_count?") do diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index b40f14a55475..f96b1c18bb6e 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -3934,7 +3934,7 @@ class TestArrayDataStatistics : public ::testing::Test { protected: std::vector valids_; - size_t null_count_; + int64_t null_count_; double distinct_count_; double max_byte_width_; double average_byte_width_; @@ -3951,7 +3951,7 @@ TEST_F(TestArrayDataStatistics, MoveConstructor) { ArrayData moved_data(std::move(copied_data)); ASSERT_TRUE(moved_data.statistics->null_count.has_value()); - ASSERT_EQ(null_count_, moved_data.statistics->null_count.value()); + ASSERT_EQ(null_count_, std::get<int64_t>(moved_data.statistics->null_count.value())); ASSERT_TRUE(moved_data.statistics->distinct_count.has_value()); ASSERT_DOUBLE_EQ(distinct_count_, @@ -3981,7 +3981,7 @@ TEST_F(TestArrayDataStatistics, CopyConstructor) { ArrayData copied_data(*data_); ASSERT_TRUE(copied_data.statistics->null_count.has_value()); - ASSERT_EQ(null_count_, copied_data.statistics->null_count.value()); + ASSERT_EQ(null_count_, std::get<int64_t>(copied_data.statistics->null_count.value())); ASSERT_TRUE(copied_data.statistics->distinct_count.has_value()); ASSERT_DOUBLE_EQ(distinct_count_, @@ -4013,7 +4013,7 @@ TEST_F(TestArrayDataStatistics, MoveAssignment) { moved_data = std::move(copied_data); ASSERT_TRUE(moved_data.statistics->null_count.has_value()); - ASSERT_EQ(null_count_, moved_data.statistics->null_count.value()); + ASSERT_EQ(null_count_, std::get<int64_t>(moved_data.statistics->null_count.value())); ASSERT_TRUE(moved_data.statistics->distinct_count.has_value()); ASSERT_DOUBLE_EQ(distinct_count_, @@ -4044,7 +4044,7 @@ TEST_F(TestArrayDataStatistics, CopyAssignment) { copied_data = *data_; ASSERT_TRUE(copied_data.statistics->null_count.has_value()); - ASSERT_EQ(null_count_, copied_data.statistics->null_count.value()); + ASSERT_EQ(null_count_, 
+ std::get<int64_t>(copied_data.statistics->null_count.value())); ASSERT_TRUE(copied_data.statistics->distinct_count.has_value()); ASSERT_DOUBLE_EQ(distinct_count_, @@ -4075,7 +4075,7 @@ TEST_F(TestArrayDataStatistics, CopyTo) { data_->CopyTo(arrow::default_cpu_memory_manager())); ASSERT_TRUE(copied_data->statistics->null_count.has_value()); - ASSERT_EQ(null_count_, copied_data->statistics->null_count.value()); + ASSERT_EQ(null_count_, std::get<int64_t>(copied_data->statistics->null_count.value())); ASSERT_TRUE(copied_data->statistics->min.has_value()); ASSERT_TRUE(std::holds_alternative(copied_data->statistics->min.value())); diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h index cbf0bc39e811..7ed0b8a009ef 100644 --- a/cpp/src/arrow/array/statistics.h +++ b/cpp/src/arrow/array/statistics.h @@ -76,7 +76,9 @@ struct ARROW_EXPORT ArrayStatistics { } /// \brief The number of null values, may not be set - std::optional<int64_t> null_count = std::nullopt; + /// Note: when set to `int64_t`, it represents `exact_null_count`, + /// and when set to `double`, it represents `approximate_null_count`. 
+ std::optional<CountType> null_count = std::nullopt; /// \brief The number of distinct values, may not be set /// Note: when set to `int64_t`, it represents `exact_distinct_count`, diff --git a/cpp/src/arrow/array/statistics_test.cc b/cpp/src/arrow/array/statistics_test.cc index 607ee8aa09fe..fba3545dabe1 100644 --- a/cpp/src/arrow/array/statistics_test.cc +++ b/cpp/src/arrow/array/statistics_test.cc @@ -25,12 +25,20 @@ namespace arrow { -TEST(TestArrayStatistics, NullCount) { +TEST(TestArrayStatistics, NullCountExact) { ArrayStatistics statistics; ASSERT_FALSE(statistics.null_count.has_value()); statistics.null_count = 29; ASSERT_TRUE(statistics.null_count.has_value()); - ASSERT_EQ(29, statistics.null_count.value()); + ASSERT_EQ(29, std::get<int64_t>(statistics.null_count.value())); +} + +TEST(TestArrayStatistics, NullCountApproximate) { + ArrayStatistics statistics; + ASSERT_FALSE(statistics.null_count.has_value()); + statistics.null_count = 29.0; + ASSERT_TRUE(statistics.null_count.has_value()); + ASSERT_DOUBLE_EQ(29.0, std::get<double>(statistics.null_count.value())); } TEST(TestArrayStatistics, DistinctCountExact) { @@ -106,11 +114,18 @@ TEST(TestArrayStatistics, Equals) { ASSERT_EQ(statistics1, statistics2); + // Test NULL_COUNT_EXACT statistics1.null_count = 29; ASSERT_NE(statistics1, statistics2); statistics2.null_count = 29; ASSERT_EQ(statistics1, statistics2); + // Test NULL_COUNT_APPROXIMATE + statistics1.null_count = 29.0; + ASSERT_NE(statistics1, statistics2); + statistics2.null_count = 29.0; + ASSERT_EQ(statistics1, statistics2); + // Test DISTINCT_COUNT_EXACT statistics1.distinct_count = static_cast<int64_t>(2929); ASSERT_NE(statistics1, statistics2); diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index a86d8ba6734d..e7d4f400fb6f 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -1563,7 +1563,8 @@ bool ArrayStatisticsOptionalValueEquals(const std::optional& left, bool ArrayStatisticsEqualsImpl(const ArrayStatistics& left, const ArrayStatistics& 
right, const EqualOptions& equal_options) { - return left.null_count == right.null_count && + return ArrayStatisticsOptionalValueEquals(left.null_count, right.null_count, + equal_options) && ArrayStatisticsOptionalValueEquals(left.distinct_count, right.distinct_count, equal_options) && ArrayStatisticsOptionalValueEquals(left.max_byte_width, right.max_byte_width, diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index 994c4c4a8de0..65c5c56c4af6 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -536,10 +536,17 @@ Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics on_stat statistics.nth_column = nth_column; if (column_statistics->null_count.has_value()) { statistics.nth_statistics++; - statistics.key = ARROW_STATISTICS_KEY_NULL_COUNT_EXACT; - statistics.type = int64(); - statistics.value = column_statistics->null_count.value(); - RETURN_NOT_OK(on_statistics(statistics)); + if (std::holds_alternative<int64_t>(column_statistics->null_count.value())) { + statistics.key = ARROW_STATISTICS_KEY_NULL_COUNT_EXACT; + statistics.type = int64(); + statistics.value = std::get<int64_t>(column_statistics->null_count.value()); + RETURN_NOT_OK(on_statistics(statistics)); + } else { + statistics.key = ARROW_STATISTICS_KEY_NULL_COUNT_APPROXIMATE; + statistics.type = float64(); + statistics.value = std::get<double>(column_statistics->null_count.value()); + RETURN_NOT_OK(on_statistics(statistics)); + } statistics.start_new_column = false; } diff --git a/cpp/src/arrow/record_batch_test.cc b/cpp/src/arrow/record_batch_test.cc index 4be29e730c18..4516b808a84f 100644 --- a/cpp/src/arrow/record_batch_test.cc +++ b/cpp/src/arrow/record_batch_test.cc @@ -1456,7 +1456,7 @@ TEST_F(TestRecordBatch, MakeStatisticsArrayRowCount) { AssertArraysEqual(*expected_statistics_array, *statistics_array, true); } -TEST_F(TestRecordBatch, MakeStatisticsArrayNullCount) { +TEST_F(TestRecordBatch, MakeStatisticsArrayNullCountExact) { auto schema = 
::arrow::schema({field("no-statistics", boolean()), field("int32", int32())}); auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); @@ -1486,6 +1486,37 @@ TEST_F(TestRecordBatch, MakeStatisticsArrayNullCount) { AssertArraysEqual(*expected_statistics_array, *statistics_array, true); } +TEST_F(TestRecordBatch, MakeStatisticsArrayNullCountApproximate) { + auto schema = + ::arrow::schema({field("no-statistics", boolean()), field("int32", int32())}); + auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]"); + auto int32_array_data = ArrayFromJSON(int32(), "[1, null, -1]")->data()->Copy(); + int32_array_data->statistics = std::make_shared<ArrayStatistics>(); + int32_array_data->statistics->null_count = 1.0; + auto int32_array = MakeArray(std::move(int32_array_data)); + auto batch = RecordBatch::Make(schema, int32_array->length(), + {no_statistics_array, int32_array}); + + ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray()); + + ASSERT_OK_AND_ASSIGN( + auto expected_statistics_array, + MakeStatisticsArray("[null, 1]", + {{ + ARROW_STATISTICS_KEY_ROW_COUNT_EXACT, + }, + { + ARROW_STATISTICS_KEY_NULL_COUNT_APPROXIMATE, + }}, + {{ + ArrayStatistics::ValueType{int64_t{3}}, + }, + { + ArrayStatistics::ValueType{1.0}, + }})); + AssertArraysEqual(*expected_statistics_array, *statistics_array, true); +} + TEST_F(TestRecordBatch, MakeStatisticsArrayDistinctCountExact) { auto schema = ::arrow::schema({field("no-statistics", boolean()), field("int32", int32())}); diff --git a/cpp/src/parquet/arrow/arrow_statistics_test.cc b/cpp/src/parquet/arrow/arrow_statistics_test.cc index 048518644c6e..4b97d04fe978 100644 --- a/cpp/src/parquet/arrow/arrow_statistics_test.cc +++ b/cpp/src/parquet/arrow/arrow_statistics_test.cc @@ -236,7 +236,8 @@ void TestStatisticsReadArray(std::shared_ptr<::arrow::DataType> arrow_type) { auto statistics = typed_read_array->statistics(); ASSERT_NE(nullptr, statistics); ASSERT_EQ(true, 
statistics->null_count.has_value()); - ASSERT_EQ(1, statistics->null_count.value()); + ASSERT_EQ(true, std::holds_alternative<int64_t>(statistics->null_count.value())); + ASSERT_EQ(1, std::get<int64_t>(statistics->null_count.value())); ASSERT_EQ(false, statistics->distinct_count.has_value()); ASSERT_EQ(true, statistics->min.has_value()); ASSERT_EQ(true, std::holds_alternative(*statistics->min)); @@ -356,7 +357,8 @@ TEST(TestStatisticsRead, MultipleRowGroupsShouldLoadStatistics) { auto statistics = typed_read_array->statistics(); ASSERT_NE(nullptr, statistics); ASSERT_EQ(true, statistics->null_count.has_value()); - ASSERT_EQ(1, statistics->null_count.value()); + ASSERT_EQ(true, std::holds_alternative<int64_t>(statistics->null_count.value())); + ASSERT_EQ(1, std::get<int64_t>(statistics->null_count.value())); ASSERT_EQ(false, statistics->distinct_count.has_value()); ASSERT_EQ(true, statistics->min.has_value()); // This is not -1 because this array has only the first 2 elements. diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index bf5beab589df..575b628db3ae 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -766,10 +766,24 @@ cdef class ArrayStatistics(_Weakrefable): null_count = self.sp_statistics.get().null_count # We'll be able to simplify this after # https://github.com/cython/cython/issues/6692 is solved. - if null_count.has_value(): - return null_count.value() - else: + if not null_count.has_value(): return None + value = null_count.value() + if holds_alternative[int64_t](value): + return get[int64_t](value) + else: + return get[double](value) + + @property + def is_null_count_exact(self): + """ + Whether the number of null values is a valid exact value or not. 
+ """ + null_count = self.sp_statistics.get().null_count + if not null_count.has_value(): + return False + value = null_count.value() + return holds_alternative[int64_t](value) @property def distinct_count(self): diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index f294ee4d50b0..c03bf20026e8 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -206,7 +206,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: c_bool is_numeric(Type type) cdef cppclass CArrayStatistics" arrow::ArrayStatistics": - optional[int64_t] null_count + optional[CArrayStatisticsCountType] null_count optional[CArrayStatisticsCountType] distinct_count optional[CArrayStatisticsValueType] min c_bool is_min_exact diff --git a/python/pyarrow/tests/parquet/test_parquet_file.py b/python/pyarrow/tests/parquet/test_parquet_file.py index 8eab9bc1740e..a62b5c3298c9 100644 --- a/python/pyarrow/tests/parquet/test_parquet_file.py +++ b/python/pyarrow/tests/parquet/test_parquet_file.py @@ -346,6 +346,7 @@ def test_read_statistics(): buf.seek(0) statistics = pq.ParquetFile(buf).read().columns[0].chunks[0].statistics + assert statistics.is_null_count_exact is True assert statistics.null_count == 1 assert statistics.distinct_count is None # TODO: add tests for is_distinct_count_exact == None and True diff --git a/ruby/red-arrow/lib/arrow/array-statistics.rb b/ruby/red-arrow/lib/arrow/array-statistics.rb index 295d1e4ffa30..81dbeef05d74 100644 --- a/ruby/red-arrow/lib/arrow/array-statistics.rb +++ b/ruby/red-arrow/lib/arrow/array-statistics.rb @@ -17,6 +17,18 @@ module Arrow class ArrayStatistics + if method_defined?(:null_count_exact) + alias_method :null_count_raw, :null_count + def null_count + return nil unless has_null_count? + if null_count_exact? 
+ null_count_exact + else + null_count_approximate + end + end + end + if method_defined?(:distinct_count_exact) alias_method :distinct_count_raw, :distinct_count def distinct_count diff --git a/ruby/red-parquet/test/test-array-statistics.rb b/ruby/red-parquet/test/test-array-statistics.rb index 9e320eb5d357..7c5246c01610 100644 --- a/ruby/red-parquet/test/test-array-statistics.rb +++ b/ruby/red-parquet/test/test-array-statistics.rb @@ -27,6 +27,16 @@ def setup @statistics = loaded_table[:int64].data.chunks[0].statistics end + def test_null_count + assert do + @statistics.has_null_count? + end + assert do + @statistics.null_count_exact? + end + assert_equal(1, @statistics.null_count) + end + def test_distinct_count assert do not @statistics.has_distinct_count?