@@ -92,11 +92,7 @@ struct ArrowBinaryHelper<ByteArrayType, ::arrow::BinaryType> {
9292 // If estimated_data_length is provided, it will also reserve the estimated data length.
9393 Status Prepare (int64_t length, std::optional<int64_t > estimated_data_length = {}) {
9494 entries_remaining_ = length;
95- RETURN_NOT_OK (builder_->Reserve (entries_remaining_));
96- if (estimated_data_length.has_value ()) {
97- RETURN_NOT_OK (builder_->ReserveData (
98- std::min<int64_t >(*estimated_data_length, this ->chunk_space_remaining_ )));
99- }
95+ RETURN_NOT_OK (ReserveInitialChunkData (estimated_data_length));
10096 return Status::OK ();
10197 }
10298
@@ -110,11 +106,7 @@ struct ArrowBinaryHelper<ByteArrayType, ::arrow::BinaryType> {
110106 // This element would exceed the capacity of a chunk
111107 RETURN_NOT_OK (PushChunk ());
112108 // Reserve entries and data in new chunk
113- RETURN_NOT_OK (builder_->Reserve (entries_remaining_));
114- if (estimated_remaining_data_length.has_value ()) {
115- RETURN_NOT_OK (builder_->ReserveData (
116- std::min<int64_t >(*estimated_remaining_data_length, chunk_space_remaining_)));
117- }
109+ RETURN_NOT_OK (ReserveInitialChunkData (estimated_remaining_data_length));
118110 }
119111 chunk_space_remaining_ -= length;
120112 --entries_remaining_;
@@ -147,6 +139,16 @@ struct ArrowBinaryHelper<ByteArrayType, ::arrow::BinaryType> {
147139 return Status::OK ();
148140 }
149141
142+ Status ReserveInitialChunkData (std::optional<int64_t > estimated_remaining_data_length) {
143+ RETURN_NOT_OK (builder_->Reserve (entries_remaining_));
144+ if (estimated_remaining_data_length.has_value ()) {
145+ int64_t required_capacity =
146+ std::min (*estimated_remaining_data_length, chunk_space_remaining_);
147+ RETURN_NOT_OK (builder_->ReserveData (required_capacity));
148+ }
149+ return Status::OK ();
150+ }
151+
150152 bool CanFit (int64_t length) const { return length <= chunk_space_remaining_; }
151153
152154 Accumulator* acc_;
@@ -754,43 +756,50 @@ class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType> {
754756 int64_t valid_bits_offset,
755757 typename EncodingTraits<ByteArrayType>::Accumulator* out,
756758 int * out_values_decoded) {
757- // We're going to decode up to `num_values - null_count` PLAIN values,
759+ // We're going to decode `num_values - null_count` PLAIN values,
758760 // and each value has a 4-byte length header that doesn't count for the
759761 // Arrow binary data length.
760- int64_t estimated_data_length =
761- std::max<int64_t >(0 , len_ - 4 * (num_values - null_count));
762+ int64_t estimated_data_length = len_ - 4 * (num_values - null_count);
763+ if (ARROW_PREDICT_FALSE (estimated_data_length < 0 )) {
764+ return Status::Invalid (" Invalid or truncated PLAIN-encoded BYTE_ARRAY data" );
765+ }
762766
763767 auto visit_binary_helper = [&](auto * helper) {
764768 int values_decoded = 0 ;
765769
766- RETURN_NOT_OK (VisitBitRuns (
767- valid_bits, valid_bits_offset, num_values,
768- [&](int64_t position, int64_t run_length, bool is_valid) {
769- if (is_valid) {
770- for (int64_t i = 0 ; i < run_length; ++i) {
771- if (ARROW_PREDICT_FALSE (len_ < 4 )) {
772- return Status::Invalid (
773- " Invalid or truncated PLAIN-encoded BYTE_ARRAY data" );
774- }
775- auto value_len = SafeLoadAs<int32_t >(data_);
776- if (ARROW_PREDICT_FALSE (value_len < 0 || value_len > len_ - 4 )) {
777- return Status::Invalid (
778- " Invalid or truncated PLAIN-encoded BYTE_ARRAY data" );
779- }
780- RETURN_NOT_OK (
781- helper->AppendValue (data_ + 4 , value_len, estimated_data_length));
782- auto increment = value_len + 4 ;
783- data_ += increment;
784- len_ -= increment;
785- estimated_data_length -= value_len;
786- DCHECK_GE (estimated_data_length, 0 );
787- }
788- values_decoded += static_cast <int >(run_length);
789- return Status::OK ();
790- } else {
791- return helper->AppendNulls (run_length);
770+ auto visit_run = [&](int64_t position, int64_t run_length, bool is_valid) {
771+ if (is_valid) {
772+ for (int64_t i = 0 ; i < run_length; ++i) {
773+ // We ensure `len_` is sufficient thanks to:
774+ // 1. the initial `estimated_data_length` check above,
775+ // 2. the running `value_len > estimated_data_length` check below.
776+ // This precondition follows from those two checks.
777+ DCHECK_GE (len_, 4 );
778+ auto value_len = SafeLoadAs<int32_t >(data_);
779+ // This check also ensures that `value_len <= len_ - 4` due to the way
780+ // `estimated_data_length` is computed.
781+ if (ARROW_PREDICT_FALSE (value_len < 0 || value_len > estimated_data_length)) {
782+ return Status::Invalid (
783+ " Invalid or truncated PLAIN-encoded BYTE_ARRAY data" );
792784 }
793- }));
785+ RETURN_NOT_OK (
786+ helper->AppendValue (data_ + 4 , value_len, estimated_data_length));
787+ auto increment = value_len + 4 ;
788+ data_ += increment;
789+ len_ -= increment;
790+ estimated_data_length -= value_len;
791+ DCHECK_GE (estimated_data_length, 0 );
792+ }
793+ values_decoded += static_cast <int >(run_length);
794+ DCHECK_LE (values_decoded, num_values);
795+ return Status::OK ();
796+ } else {
797+ return helper->AppendNulls (run_length);
798+ }
799+ };
800+
801+ RETURN_NOT_OK (
802+ VisitBitRuns (valid_bits, valid_bits_offset, num_values, std::move (visit_run)));
794803
795804 num_values_ -= values_decoded;
796805 *out_values_decoded = values_decoded;
0 commit comments