Skip to content

Commit 32a112d

Browse files
authored
GH-47102: [Statistics][C++] Implement Statistics specification attribute ARROW:max_byte_width:{exact,approximate} Component: C++ (#47463)
### Rationale for this change Add the `max_byte_width` `{exact,approximate}` statistics attributes. ### What changes are included in this PR? Add `arrow::ArrayStatistics::max_byte_width` with relevant unit tests. ### Are these changes tested? Yes, I ran the related unit tests. ### Are there any user-facing changes? Yes, this adds `arrow::ArrayStatistics::max_byte_width`. * GitHub Issue: #47102 Authored-by: Arash Andishgar <[email protected]> Signed-off-by: Sutou Kouhei <[email protected]>
1 parent fddd356 commit 32a112d

File tree

6 files changed

+138
-0
lines changed

6 files changed

+138
-0
lines changed

cpp/src/arrow/array/array_test.cc

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3911,6 +3911,7 @@ class TestArrayDataStatistics : public ::testing::Test {
39113911
valids_ = {1, 0, 1, 1};
39123912
null_count_ = std::count(valids_.begin(), valids_.end(), 0);
39133913
distinct_count_ = 3.0;
3914+
max_byte_width_ = 4.0;
39143915
average_byte_width_ = 4.0;
39153916
null_buffer_ = *internal::BytesToBits(valids_);
39163917
values_ = {1, 0, 3, -4};
@@ -3922,6 +3923,7 @@ class TestArrayDataStatistics : public ::testing::Test {
39223923
data_->statistics = std::make_shared<ArrayStatistics>();
39233924
data_->statistics->null_count = null_count_;
39243925
data_->statistics->distinct_count = distinct_count_;
3926+
data_->statistics->max_byte_width = max_byte_width_;
39253927
data_->statistics->average_byte_width = average_byte_width_;
39263928
data_->statistics->is_average_byte_width_exact = true;
39273929
data_->statistics->min = min_;
@@ -3934,6 +3936,7 @@ class TestArrayDataStatistics : public ::testing::Test {
39343936
std::vector<uint8_t> valids_;
39353937
size_t null_count_;
39363938
double distinct_count_;
3939+
double max_byte_width_;
39373940
double average_byte_width_;
39383941
std::shared_ptr<Buffer> null_buffer_;
39393942
std::vector<int32_t> values_;
@@ -3954,6 +3957,10 @@ TEST_F(TestArrayDataStatistics, MoveConstructor) {
39543957
ASSERT_DOUBLE_EQ(distinct_count_,
39553958
std::get<double>(moved_data.statistics->distinct_count.value()));
39563959

3960+
ASSERT_TRUE(moved_data.statistics->max_byte_width.has_value());
3961+
ASSERT_DOUBLE_EQ(max_byte_width_,
3962+
std::get<double>(moved_data.statistics->max_byte_width.value()));
3963+
39573964
ASSERT_TRUE(moved_data.statistics->average_byte_width.has_value());
39583965
ASSERT_DOUBLE_EQ(average_byte_width_,
39593966
moved_data.statistics->average_byte_width.value());
@@ -3980,6 +3987,10 @@ TEST_F(TestArrayDataStatistics, CopyConstructor) {
39803987
ASSERT_DOUBLE_EQ(distinct_count_,
39813988
std::get<double>(copied_data.statistics->distinct_count.value()));
39823989

3990+
ASSERT_TRUE(copied_data.statistics->max_byte_width.has_value());
3991+
ASSERT_DOUBLE_EQ(max_byte_width_,
3992+
std::get<double>(copied_data.statistics->max_byte_width.value()));
3993+
39833994
ASSERT_TRUE(copied_data.statistics->average_byte_width.has_value());
39843995
ASSERT_DOUBLE_EQ(average_byte_width_,
39853996
copied_data.statistics->average_byte_width.value());
@@ -4008,6 +4019,10 @@ TEST_F(TestArrayDataStatistics, MoveAssignment) {
40084019
ASSERT_DOUBLE_EQ(distinct_count_,
40094020
std::get<double>(moved_data.statistics->distinct_count.value()));
40104021

4022+
ASSERT_TRUE(moved_data.statistics->max_byte_width.has_value());
4023+
ASSERT_DOUBLE_EQ(max_byte_width_,
4024+
std::get<double>(moved_data.statistics->max_byte_width.value()));
4025+
40114026
ASSERT_TRUE(moved_data.statistics->average_byte_width.has_value());
40124027
ASSERT_DOUBLE_EQ(average_byte_width_,
40134028
moved_data.statistics->average_byte_width.value());
@@ -4035,6 +4050,10 @@ TEST_F(TestArrayDataStatistics, CopyAssignment) {
40354050
ASSERT_DOUBLE_EQ(distinct_count_,
40364051
std::get<double>(copied_data.statistics->distinct_count.value()));
40374052

4053+
ASSERT_TRUE(copied_data.statistics->max_byte_width.has_value());
4054+
ASSERT_DOUBLE_EQ(max_byte_width_,
4055+
std::get<double>(copied_data.statistics->max_byte_width.value()));
4056+
40384057
ASSERT_TRUE(copied_data.statistics->average_byte_width.has_value());
40394058
ASSERT_DOUBLE_EQ(average_byte_width_,
40404059
copied_data.statistics->average_byte_width.value());

cpp/src/arrow/array/statistics.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ struct ARROW_EXPORT ArrayStatistics {
4141
using ValueType = std::variant<bool, int64_t, uint64_t, double, std::string>;
4242
using NumericType = std::variant<int64_t, double>;
4343
using CountType = NumericType;
44+
using SizeType = NumericType;
4445

4546
static const std::shared_ptr<DataType>& ValueToArrowType(
4647
const std::optional<ValueType>& value,
@@ -82,6 +83,11 @@ struct ARROW_EXPORT ArrayStatistics {
8283
/// and when set to `double`, it represents `approximate_distinct_count`.
8384
std::optional<CountType> distinct_count = std::nullopt;
8485

86+
/// \brief The maximum size in bytes of a row in an array; may not be set.
87+
/// Note: when the type is `int64_t`, it represents `max_byte_width_exact`,
88+
/// and when the type is `double`, it represents `max_byte_width_approximate`.
89+
std::optional<SizeType> max_byte_width = std::nullopt;
90+
8591
/// \brief The average size in bytes of a row in an array, may not be set.
8692
std::optional<double> average_byte_width = std::nullopt;
8793

cpp/src/arrow/array/statistics_test.cc

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,22 @@ TEST(TestArrayStatistics, DistinctCountApproximate) {
4949
ASSERT_DOUBLE_EQ(29.0, std::get<double>(statistics.distinct_count.value()));
5050
}
5151

52+
// Checks the int64_t alternative of ArrayStatistics::max_byte_width:
// the field starts out empty, and after assigning an int64_t it reports a
// value that can be read back through std::get<int64_t>.
TEST(TestArrayStatistics, MaxByteWidthExact) {
53+
ArrayStatistics statistics;
54+
// A default-constructed ArrayStatistics carries no max_byte_width.
ASSERT_FALSE(statistics.max_byte_width.has_value());
55+
// The explicit cast selects the int64_t alternative of the variant.
statistics.max_byte_width = static_cast<int64_t>(5);
56+
ASSERT_TRUE(statistics.max_byte_width.has_value());
57+
ASSERT_EQ(5, std::get<int64_t>(statistics.max_byte_width.value()));
58+
}
59+
60+
// Checks the double alternative of ArrayStatistics::max_byte_width:
// the field starts out empty, and after assigning a double it reports a
// value that can be read back through std::get<double>.
TEST(TestArrayStatistics, MaxByteWidthApproximate) {
61+
ArrayStatistics statistics;
62+
// A default-constructed ArrayStatistics carries no max_byte_width.
ASSERT_FALSE(statistics.max_byte_width.has_value());
63+
// A double literal selects the double alternative of the variant.
statistics.max_byte_width = 5.0;
64+
ASSERT_TRUE(statistics.max_byte_width.has_value());
65+
ASSERT_DOUBLE_EQ(5.0, std::get<double>(statistics.max_byte_width.value()));
66+
}
67+
5268
TEST(TestArrayStatistics, AverageByteWidth) {
5369
ArrayStatistics statistics;
5470
ASSERT_FALSE(statistics.average_byte_width.has_value());
@@ -107,6 +123,18 @@ TEST(TestArrayStatistics, Equals) {
107123
statistics2.distinct_count = 2930.5;
108124
ASSERT_EQ(statistics1, statistics2);
109125

126+
// Test MAX_BYTE_WIDTH_EXACT
127+
statistics1.max_byte_width = static_cast<int64_t>(5);
128+
ASSERT_NE(statistics1, statistics2);
129+
statistics2.max_byte_width = static_cast<int64_t>(5);
130+
ASSERT_EQ(statistics1, statistics2);
131+
132+
// Test MAX_BYTE_WIDTH_APPROXIMATE
133+
statistics1.max_byte_width = 5.0;
134+
ASSERT_NE(statistics1, statistics2);
135+
statistics2.max_byte_width = 5.0;
136+
ASSERT_EQ(statistics1, statistics2);
137+
110138
statistics1.average_byte_width = 2.9;
111139
ASSERT_NE(statistics1, statistics2);
112140
statistics2.average_byte_width = 2.9;

cpp/src/arrow/compare.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1566,6 +1566,8 @@ bool ArrayStatisticsEqualsImpl(const ArrayStatistics& left, const ArrayStatistic
15661566
return left.null_count == right.null_count &&
15671567
ArrayStatisticsOptionalValueEquals(left.distinct_count, right.distinct_count,
15681568
equal_options) &&
1569+
ArrayStatisticsOptionalValueEquals(left.max_byte_width, right.max_byte_width,
1570+
equal_options) &&
15691571
left.is_average_byte_width_exact == right.is_average_byte_width_exact &&
15701572
left.is_min_exact == right.is_min_exact &&
15711573
left.is_max_exact == right.is_max_exact &&

cpp/src/arrow/record_batch.cc

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -564,6 +564,22 @@ Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics on_stat
564564
statistics.start_new_column = false;
565565
}
566566

567+
if (column_statistics->max_byte_width.has_value()) {
568+
statistics.nth_statistics++;
569+
if (std::holds_alternative<int64_t>(column_statistics->max_byte_width.value())) {
570+
statistics.key = ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_EXACT;
571+
statistics.type = int64();
572+
statistics.value = std::get<int64_t>(column_statistics->max_byte_width.value());
573+
} else {
574+
statistics.key = ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_APPROXIMATE;
575+
statistics.type = float64();
576+
statistics.value = std::get<double>(column_statistics->max_byte_width.value());
577+
}
578+
579+
RETURN_NOT_OK(on_statistics(statistics));
580+
statistics.start_new_column = false;
581+
}
582+
567583
if (column_statistics->average_byte_width.has_value()) {
568584
statistics.nth_statistics++;
569585
if (column_statistics->is_average_byte_width_exact) {

cpp/src/arrow/record_batch_test.cc

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1538,6 +1538,73 @@ TEST_F(TestRecordBatch, MakeStatisticsArrayDistinctCountApproximate) {
15381538
AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
15391539
}
15401540

1541+
// Builds a two-column record batch (a boolean column without statistics and a
// utf8 column whose statistics set null_count = 1 and an int64_t
// max_byte_width = 2) and checks that RecordBatch::MakeStatisticsArray()
// produces the expected statistics array, including an
// ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_EXACT entry with the int64 value 2.
TEST_F(TestRecordBatch, MakeStatisticsArrayMaxByteWidthExact) {
1542+
auto schema =
1543+
::arrow::schema({field("no-statistics", boolean()), field("utf8", utf8())});
1544+
auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
1545+
// Copy the ArrayData so statistics can be attached before wrapping it in an array.
auto string_array_data = ArrayFromJSON(utf8(), R"(["aa", null, "c"])")->data()->Copy();
1546+
string_array_data->statistics = std::make_shared<ArrayStatistics>();
1547+
string_array_data->statistics->null_count = 1;
1548+
// int64_t alternative of max_byte_width.
string_array_data->statistics->max_byte_width = static_cast<int64_t>(2);
1549+
auto string_array = MakeArray(std::move(string_array_data));
1550+
auto batch = RecordBatch::Make(schema, string_array->length(),
1551+
{no_statistics_array, string_array});
1552+
1553+
ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
1554+
1555+
// Expected contents: one group holding ROW_COUNT_EXACT = 3, and one group
// holding NULL_COUNT_EXACT = 1 followed by MAX_BYTE_WIDTH_EXACT = 2.
// NOTE(review): "[null, 1]" is presumably the column index per group (null
// for the batch-level entry) -- confirm against the MakeStatisticsArray
// test helper, which is not visible here.
ASSERT_OK_AND_ASSIGN(auto expected_statistics_array,
1556+
MakeStatisticsArray("[null, 1]",
1557+
{{
1558+
ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
1559+
},
1560+
{
1561+
ARROW_STATISTICS_KEY_NULL_COUNT_EXACT,
1562+
ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_EXACT,
1563+
}},
1564+
{{
1565+
ArrayStatistics::ValueType{int64_t{3}},
1566+
},
1567+
{
1568+
ArrayStatistics::ValueType{int64_t{1}},
1569+
ArrayStatistics::ValueType{int64_t{2}},
1570+
}}));
1571+
AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
1572+
}
1573+
1574+
// Builds a two-column record batch (a boolean column without statistics and a
// utf8 column whose statistics set null_count = 1 and a double
// max_byte_width = 2.0) and checks that RecordBatch::MakeStatisticsArray()
// produces the expected statistics array, including an
// ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_APPROXIMATE entry with the double value 2.0.
TEST_F(TestRecordBatch, MakeStatisticsArrayMaxByteWidthApproximate) {
1575+
auto schema =
1576+
::arrow::schema({field("no-statistics", boolean()), field("utf8", utf8())});
1577+
auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
1578+
// Copy the ArrayData so statistics can be attached before wrapping it in an array.
auto string_array_data = ArrayFromJSON(utf8(), R"(["aa", null, "c"])")->data()->Copy();
1579+
string_array_data->statistics = std::make_shared<ArrayStatistics>();
1580+
string_array_data->statistics->null_count = 1;
1581+
// double alternative of max_byte_width.
string_array_data->statistics->max_byte_width = 2.0;
1582+
auto string_array = MakeArray(std::move(string_array_data));
1583+
auto batch = RecordBatch::Make(schema, string_array->length(),
1584+
{no_statistics_array, string_array});
1585+
1586+
ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
1587+
1588+
// Expected contents: one group holding ROW_COUNT_EXACT = 3, and one group
// holding NULL_COUNT_EXACT = 1 followed by MAX_BYTE_WIDTH_APPROXIMATE = 2.0.
// NOTE(review): "[null, 1]" is presumably the column index per group (null
// for the batch-level entry) -- confirm against the MakeStatisticsArray
// test helper, which is not visible here.
ASSERT_OK_AND_ASSIGN(
1589+
auto expected_statistics_array,
1590+
MakeStatisticsArray("[null, 1]",
1591+
{{
1592+
ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
1593+
},
1594+
{
1595+
ARROW_STATISTICS_KEY_NULL_COUNT_EXACT,
1596+
ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_APPROXIMATE,
1597+
}},
1598+
{{
1599+
ArrayStatistics::ValueType{int64_t{3}},
1600+
},
1601+
{
1602+
ArrayStatistics::ValueType{int64_t{1}},
1603+
ArrayStatistics::ValueType{2.0},
1604+
}}));
1605+
AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
1606+
}
1607+
15411608
TEST_F(TestRecordBatch, MakeStatisticsArrayAverageByteWidthApproximate) {
15421609
auto schema =
15431610
::arrow::schema({field("no-statistics", boolean()), field("utf8", utf8())});

0 commit comments

Comments
 (0)