Skip to content

Commit 4d566e6

Browse files
authored
GH-45664: [C++] Allow LargeString,LargeBinary,FixedSizeBinary,StringView and BinaryView for RecordBatch::MakeStatisticsArray() (#46031)
### Rationale for this change MakeStatisticsArray in RecordBatch does not support the StringView, BinaryView, LargeString, LargeBinary, and FixedSizeBinary types. ### What changes are included in this PR? MakeStatisticsArray in RecordBatch is corrected to support all Arrow string types, and relevant tests are added. Additionally, some changes were applied to MakeStatisticsArray in record_batch_test.cc. ### Are these changes tested? Yes, I ran all relevant unit tests. ### Are there any user-facing changes? Yes. (Add support for `large_utf8`, `large_binary`, `fixed_size_binary`, `StringView`, and `BinaryView` to `RecordBatch::MakeStatisticsArray()`). * GitHub Issue: #45664 Authored-by: Arash Andishgar <[email protected]> Signed-off-by: Sutou Kouhei <[email protected]>
1 parent 520ae44 commit 4d566e6

File tree

4 files changed

+238
-58
lines changed

4 files changed

+238
-58
lines changed

cpp/src/arrow/array/statistics.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ struct ARROW_EXPORT ArrayStatistics {
6060
case Type::FIXED_SIZE_BINARY:
6161
case Type::LARGE_STRING:
6262
case Type::LARGE_BINARY:
63+
case Type::BINARY_VIEW:
64+
case Type::STRING_VIEW:
6365
return array_type;
6466
default:
6567
return utf8();

cpp/src/arrow/record_batch.cc

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,15 @@
3232
#include "arrow/array/builder_nested.h"
3333
#include "arrow/array/builder_union.h"
3434
#include "arrow/array/concatenate.h"
35+
#include "arrow/array/statistics.h"
3536
#include "arrow/array/validate.h"
3637
#include "arrow/c/abi.h"
3738
#include "arrow/pretty_print.h"
3839
#include "arrow/status.h"
3940
#include "arrow/table.h"
4041
#include "arrow/tensor.h"
4142
#include "arrow/type.h"
43+
#include "arrow/type_traits.h"
4244
#include "arrow/util/iterator.h"
4345
#include "arrow/util/logging_internal.h"
4446
#include "arrow/util/vector.h"
@@ -556,6 +558,21 @@ Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics on_stat
556558
}
557559
return Status::OK();
558560
}
561+
// Type visitor that appends a std::string value to a type-erased ArrayBuilder.
// It is dispatched on the builder's data type (see the VisitTypeInline call in
// RecordBatch::MakeStatisticsArray), so the concrete builder class is chosen
// from the visited type instead of being hard-coded to StringBuilder.
struct StringBuilderVisitor {
562+
// Overload enabled only for types accepted by enable_if_has_string_view
// (presumably the string/binary-like types named in the commit message --
// confirm against arrow/type_traits.h).
template <typename DataType>
563+
enable_if_has_string_view<DataType, Status> Visit(const DataType&,
564+
ArrayBuilder* raw_builder,
565+
const std::string& value) {
566+
// Builder class associated with the visited type via TypeTraits.
using Builder = typename TypeTraits<DataType>::BuilderType;
567+
// Unchecked downcast: assumes raw_builder was created for this exact type.
auto builder = static_cast<Builder*>(raw_builder);
568+
return builder->Append(value);
569+
}
570+
571+
// Fallback overload for every other type: appends nothing and returns
// Status::Invalid naming the offending type.
Status Visit(const DataType& type, ArrayBuilder*, const std::string&) {
572+
return Status::Invalid("Only string types are supported and the current type is ",
573+
type.ToString());
574+
}
575+
};
559576
} // namespace
560577

561578
Result<std::shared_ptr<Array>> RecordBatch::MakeStatisticsArray(
@@ -580,7 +597,7 @@ Result<std::shared_ptr<Array>> RecordBatch::MakeStatisticsArray(
580597
RETURN_NOT_OK(EnumerateStatistics(*this, [&](const EnumeratedStatistics& statistics) {
581598
int8_t i = 0;
582599
for (const auto& field : values_types) {
583-
if (field->type()->id() == statistics.type->id()) {
600+
if (field->type()->Equals(statistics.type)) {
584601
break;
585602
}
586603
i++;
@@ -680,8 +697,8 @@ Result<std::shared_ptr<Array>> RecordBatch::MakeStatisticsArray(
680697
return static_cast<DoubleBuilder*>(builder)->Append(value);
681698
}
682699
Status operator()(const std::string& value) {
683-
return static_cast<StringBuilder*>(builder)->Append(
684-
value.data(), static_cast<int32_t>(value.size()));
700+
StringBuilderVisitor visitor;
701+
return VisitTypeInline(*builder->type(), &visitor, builder, value);
685702
}
686703
} visitor;
687704
visitor.builder = values_builders[values_type_index].get();

0 commit comments

Comments
 (0)