@@ -92,11 +92,7 @@ struct ArrowBinaryHelper<ByteArrayType, ::arrow::BinaryType> {
9292 // If estimated_data_length is provided, it will also reserve the estimated data length.
9393 Status Prepare (int64_t length, std::optional<int64_t > estimated_data_length = {}) {
9494 entries_remaining_ = length;
95- RETURN_NOT_OK (builder_->Reserve (entries_remaining_));
96- if (estimated_data_length.has_value ()) {
97- RETURN_NOT_OK (builder_->ReserveData (
98- std::min<int64_t >(*estimated_data_length, this ->chunk_space_remaining_ )));
99- }
95+ RETURN_NOT_OK (ReserveInitialChunkData (estimated_data_length));
10096 return Status::OK ();
10197 }
10298
@@ -110,11 +106,7 @@ struct ArrowBinaryHelper<ByteArrayType, ::arrow::BinaryType> {
110106 // This element would exceed the capacity of a chunk
111107 RETURN_NOT_OK (PushChunk ());
112108 // Reserve entries and data in new chunk
113- RETURN_NOT_OK (builder_->Reserve (entries_remaining_));
114- if (estimated_remaining_data_length.has_value ()) {
115- RETURN_NOT_OK (builder_->ReserveData (
116- std::min<int64_t >(*estimated_remaining_data_length, chunk_space_remaining_)));
117- }
109+ RETURN_NOT_OK (ReserveInitialChunkData (estimated_remaining_data_length));
118110 }
119111 chunk_space_remaining_ -= length;
120112 --entries_remaining_;
@@ -147,6 +139,20 @@ struct ArrowBinaryHelper<ByteArrayType, ::arrow::BinaryType> {
147139 return Status::OK ();
148140 }
149141
142+ Status ReserveInitialChunkData (std::optional<int64_t > estimated_remaining_data_length) {
143+ RETURN_NOT_OK (builder_->Reserve (entries_remaining_));
144+ if (estimated_remaining_data_length.has_value ()) {
145+ int64_t required_capacity =
146+ std::min (*estimated_remaining_data_length, chunk_space_remaining_);
147+ // Ensure we'll be allocating a non-zero-sized data buffer, to avoid encountering
148+ // a null pointer
149+ constexpr int64_t kMinReserveCapacity = 64 ;
150+ RETURN_NOT_OK (
151+ builder_->ReserveData (std::max (required_capacity, kMinReserveCapacity )));
152+ }
153+ return Status::OK ();
154+ }
155+
150156 bool CanFit (int64_t length) const { return length <= chunk_space_remaining_; }
151157
152158 Accumulator* acc_;
@@ -763,34 +769,35 @@ class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType> {
763769 auto visit_binary_helper = [&](auto * helper) {
764770 int values_decoded = 0 ;
765771
766- RETURN_NOT_OK (VisitBitRuns (
767- valid_bits, valid_bits_offset, num_values,
768- [&](int64_t position, int64_t run_length, bool is_valid) {
769- if (is_valid) {
770- for (int64_t i = 0 ; i < run_length; ++i) {
771- if (ARROW_PREDICT_FALSE (len_ < 4 )) {
772- return Status::Invalid (
773- " Invalid or truncated PLAIN-encoded BYTE_ARRAY data" );
774- }
775- auto value_len = SafeLoadAs<int32_t >(data_);
776- if (ARROW_PREDICT_FALSE (value_len < 0 || value_len > len_ - 4 )) {
777- return Status::Invalid (
778- " Invalid or truncated PLAIN-encoded BYTE_ARRAY data" );
779- }
780- RETURN_NOT_OK (
781- helper->AppendValue (data_ + 4 , value_len, estimated_data_length));
782- auto increment = value_len + 4 ;
783- data_ += increment;
784- len_ -= increment;
785- estimated_data_length -= value_len;
786- DCHECK_GE (estimated_data_length, 0 );
787- }
788- values_decoded += static_cast <int >(run_length);
789- return Status::OK ();
790- } else {
791- return helper->AppendNulls (run_length);
792- }
793- }));
772+ RETURN_NOT_OK (
773+ VisitBitRuns (valid_bits, valid_bits_offset, num_values,
774+ [&](int64_t position, int64_t run_length, bool is_valid) {
775+ if (is_valid) {
776+ for (int64_t i = 0 ; i < run_length; ++i) {
777+ if (ARROW_PREDICT_FALSE (len_ < 4 )) {
778+ return Status::Invalid (
779+ " Invalid or truncated PLAIN-encoded BYTE_ARRAY data" );
780+ }
781+ auto value_len = SafeLoadAs<int32_t >(data_);
782+ if (ARROW_PREDICT_FALSE (value_len < 0 ||
783+ value_len > estimated_data_length)) {
784+ return Status::Invalid (
785+ " Invalid or truncated PLAIN-encoded BYTE_ARRAY data" );
786+ }
787+ RETURN_NOT_OK (helper->AppendValue (data_ + 4 , value_len,
788+ estimated_data_length));
789+ auto increment = value_len + 4 ;
790+ data_ += increment;
791+ len_ -= increment;
792+ estimated_data_length -= value_len;
793+ DCHECK_GE (estimated_data_length, 0 );
794+ }
795+ values_decoded += static_cast <int >(run_length);
796+ return Status::OK ();
797+ } else {
798+ return helper->AppendNulls (run_length);
799+ }
800+ }));
794801
795802 num_values_ -= values_decoded;
796803 *out_values_decoded = values_decoded;
0 commit comments