Skip to content

Commit ea6dc7b

Browse files
authored
GH-48308: [C++][Parquet] Fix potential crash when reading invalid Parquet data (#48309)
### Rationale for this change 1. Make value length validation stricter in PLAIN decoding of BYTE_ARRAY columns. 2. Ensure BinaryBuilder always reserves a non-zero data length, to avoid encountering a null data pointer when unsafe-appending values. This should fix https://oss-fuzz.com/testcase-detail/4570087119192064 ### Are these changes tested? Yes, by additional fuzz regression file. ### Are there any user-facing changes? No. **This PR contains a "Critical Fix".** (If the changes fix either (a) a security vulnerability, (b) a bug that caused incorrect or invalid data to be produced, or (c) a bug that causes a crash (even when the API contract is upheld), please provide explanation. If not, you can remove this.) * GitHub Issue: #48308 Authored-by: Antoine Pitrou <[email protected]> Signed-off-by: Antoine Pitrou <[email protected]>
1 parent 79695c2 commit ea6dc7b

File tree

2 files changed

+50
-41
lines changed

2 files changed

+50
-41
lines changed

cpp/src/parquet/decoder.cc

Lines changed: 49 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -92,11 +92,7 @@ struct ArrowBinaryHelper<ByteArrayType, ::arrow::BinaryType> {
9292
// If estimated_data_length is provided, it will also reserve the estimated data length.
9393
Status Prepare(int64_t length, std::optional<int64_t> estimated_data_length = {}) {
9494
entries_remaining_ = length;
95-
RETURN_NOT_OK(builder_->Reserve(entries_remaining_));
96-
if (estimated_data_length.has_value()) {
97-
RETURN_NOT_OK(builder_->ReserveData(
98-
std::min<int64_t>(*estimated_data_length, this->chunk_space_remaining_)));
99-
}
95+
RETURN_NOT_OK(ReserveInitialChunkData(estimated_data_length));
10096
return Status::OK();
10197
}
10298

@@ -110,11 +106,7 @@ struct ArrowBinaryHelper<ByteArrayType, ::arrow::BinaryType> {
110106
// This element would exceed the capacity of a chunk
111107
RETURN_NOT_OK(PushChunk());
112108
// Reserve entries and data in new chunk
113-
RETURN_NOT_OK(builder_->Reserve(entries_remaining_));
114-
if (estimated_remaining_data_length.has_value()) {
115-
RETURN_NOT_OK(builder_->ReserveData(
116-
std::min<int64_t>(*estimated_remaining_data_length, chunk_space_remaining_)));
117-
}
109+
RETURN_NOT_OK(ReserveInitialChunkData(estimated_remaining_data_length));
118110
}
119111
chunk_space_remaining_ -= length;
120112
--entries_remaining_;
@@ -147,6 +139,16 @@ struct ArrowBinaryHelper<ByteArrayType, ::arrow::BinaryType> {
147139
return Status::OK();
148140
}
149141

142+
Status ReserveInitialChunkData(std::optional<int64_t> estimated_remaining_data_length) {
143+
RETURN_NOT_OK(builder_->Reserve(entries_remaining_));
144+
if (estimated_remaining_data_length.has_value()) {
145+
int64_t required_capacity =
146+
std::min(*estimated_remaining_data_length, chunk_space_remaining_);
147+
RETURN_NOT_OK(builder_->ReserveData(required_capacity));
148+
}
149+
return Status::OK();
150+
}
151+
150152
bool CanFit(int64_t length) const { return length <= chunk_space_remaining_; }
151153

152154
Accumulator* acc_;
@@ -754,43 +756,50 @@ class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType> {
754756
int64_t valid_bits_offset,
755757
typename EncodingTraits<ByteArrayType>::Accumulator* out,
756758
int* out_values_decoded) {
757-
// We're going to decode up to `num_values - null_count` PLAIN values,
759+
// We're going to decode `num_values - null_count` PLAIN values,
758760
// and each value has a 4-byte length header that doesn't count for the
759761
// Arrow binary data length.
760-
int64_t estimated_data_length =
761-
std::max<int64_t>(0, len_ - 4 * (num_values - null_count));
762+
int64_t estimated_data_length = len_ - 4 * (num_values - null_count);
763+
if (ARROW_PREDICT_FALSE(estimated_data_length < 0)) {
764+
return Status::Invalid("Invalid or truncated PLAIN-encoded BYTE_ARRAY data");
765+
}
762766

763767
auto visit_binary_helper = [&](auto* helper) {
764768
int values_decoded = 0;
765769

766-
RETURN_NOT_OK(VisitBitRuns(
767-
valid_bits, valid_bits_offset, num_values,
768-
[&](int64_t position, int64_t run_length, bool is_valid) {
769-
if (is_valid) {
770-
for (int64_t i = 0; i < run_length; ++i) {
771-
if (ARROW_PREDICT_FALSE(len_ < 4)) {
772-
return Status::Invalid(
773-
"Invalid or truncated PLAIN-encoded BYTE_ARRAY data");
774-
}
775-
auto value_len = SafeLoadAs<int32_t>(data_);
776-
if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > len_ - 4)) {
777-
return Status::Invalid(
778-
"Invalid or truncated PLAIN-encoded BYTE_ARRAY data");
779-
}
780-
RETURN_NOT_OK(
781-
helper->AppendValue(data_ + 4, value_len, estimated_data_length));
782-
auto increment = value_len + 4;
783-
data_ += increment;
784-
len_ -= increment;
785-
estimated_data_length -= value_len;
786-
DCHECK_GE(estimated_data_length, 0);
787-
}
788-
values_decoded += static_cast<int>(run_length);
789-
return Status::OK();
790-
} else {
791-
return helper->AppendNulls(run_length);
770+
auto visit_run = [&](int64_t position, int64_t run_length, bool is_valid) {
771+
if (is_valid) {
772+
for (int64_t i = 0; i < run_length; ++i) {
773+
// We ensure `len_` is sufficient thanks to:
774+
// 1. the initial `estimated_data_length` check above,
775+
// 2. the running `value_len > estimated_data_length` check below.
776+
// This precondition follows from those two checks.
777+
DCHECK_GE(len_, 4);
778+
auto value_len = SafeLoadAs<int32_t>(data_);
779+
// This check also ensures that `value_len <= len_ - 4` due to the way
780+
// `estimated_data_length` is computed.
781+
if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > estimated_data_length)) {
782+
return Status::Invalid(
783+
"Invalid or truncated PLAIN-encoded BYTE_ARRAY data");
792784
}
793-
}));
785+
RETURN_NOT_OK(
786+
helper->AppendValue(data_ + 4, value_len, estimated_data_length));
787+
auto increment = value_len + 4;
788+
data_ += increment;
789+
len_ -= increment;
790+
estimated_data_length -= value_len;
791+
DCHECK_GE(estimated_data_length, 0);
792+
}
793+
values_decoded += static_cast<int>(run_length);
794+
DCHECK_LE(values_decoded, num_values);
795+
return Status::OK();
796+
} else {
797+
return helper->AppendNulls(run_length);
798+
}
799+
};
800+
801+
RETURN_NOT_OK(
802+
VisitBitRuns(valid_bits, valid_bits_offset, num_values, std::move(visit_run)));
794803

795804
num_values_ -= values_decoded;
796805
*out_values_decoded = values_decoded;

0 commit comments

Comments
 (0)