Skip to content

Commit 2c5dda8

Browse files
committed
GH-48308: [C++][Parquet] Fix potential crash when reading invalid Parquet data
1 parent 57cb172 commit 2c5dda8

File tree

2 files changed

+46
-39
lines changed

2 files changed

+46
-39
lines changed

cpp/src/parquet/decoder.cc

Lines changed: 45 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -92,11 +92,7 @@ struct ArrowBinaryHelper<ByteArrayType, ::arrow::BinaryType> {
9292
// If estimated_data_length is provided, it will also reserve the estimated data length.
9393
Status Prepare(int64_t length, std::optional<int64_t> estimated_data_length = {}) {
9494
entries_remaining_ = length;
95-
RETURN_NOT_OK(builder_->Reserve(entries_remaining_));
96-
if (estimated_data_length.has_value()) {
97-
RETURN_NOT_OK(builder_->ReserveData(
98-
std::min<int64_t>(*estimated_data_length, this->chunk_space_remaining_)));
99-
}
95+
RETURN_NOT_OK(ReserveInitialChunkData(estimated_data_length));
10096
return Status::OK();
10197
}
10298

@@ -110,11 +106,7 @@ struct ArrowBinaryHelper<ByteArrayType, ::arrow::BinaryType> {
110106
// This element would exceed the capacity of a chunk
111107
RETURN_NOT_OK(PushChunk());
112108
// Reserve entries and data in new chunk
113-
RETURN_NOT_OK(builder_->Reserve(entries_remaining_));
114-
if (estimated_remaining_data_length.has_value()) {
115-
RETURN_NOT_OK(builder_->ReserveData(
116-
std::min<int64_t>(*estimated_remaining_data_length, chunk_space_remaining_)));
117-
}
109+
RETURN_NOT_OK(ReserveInitialChunkData(estimated_remaining_data_length));
118110
}
119111
chunk_space_remaining_ -= length;
120112
--entries_remaining_;
@@ -147,6 +139,20 @@ struct ArrowBinaryHelper<ByteArrayType, ::arrow::BinaryType> {
147139
return Status::OK();
148140
}
149141

142+
Status ReserveInitialChunkData(std::optional<int64_t> estimated_remaining_data_length) {
143+
RETURN_NOT_OK(builder_->Reserve(entries_remaining_));
144+
if (estimated_remaining_data_length.has_value()) {
145+
int64_t required_capacity =
146+
std::min(*estimated_remaining_data_length, chunk_space_remaining_);
147+
// Ensure we'll be allocating a non-zero-sized data buffer, to avoid encountering
148+
// a null pointer
149+
constexpr int64_t kMinReserveCapacity = 64;
150+
RETURN_NOT_OK(
151+
builder_->ReserveData(std::max(required_capacity, kMinReserveCapacity)));
152+
}
153+
return Status::OK();
154+
}
155+
150156
bool CanFit(int64_t length) const { return length <= chunk_space_remaining_; }
151157

152158
Accumulator* acc_;
@@ -763,34 +769,35 @@ class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType> {
763769
auto visit_binary_helper = [&](auto* helper) {
764770
int values_decoded = 0;
765771

766-
RETURN_NOT_OK(VisitBitRuns(
767-
valid_bits, valid_bits_offset, num_values,
768-
[&](int64_t position, int64_t run_length, bool is_valid) {
769-
if (is_valid) {
770-
for (int64_t i = 0; i < run_length; ++i) {
771-
if (ARROW_PREDICT_FALSE(len_ < 4)) {
772-
return Status::Invalid(
773-
"Invalid or truncated PLAIN-encoded BYTE_ARRAY data");
774-
}
775-
auto value_len = SafeLoadAs<int32_t>(data_);
776-
if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > len_ - 4)) {
777-
return Status::Invalid(
778-
"Invalid or truncated PLAIN-encoded BYTE_ARRAY data");
779-
}
780-
RETURN_NOT_OK(
781-
helper->AppendValue(data_ + 4, value_len, estimated_data_length));
782-
auto increment = value_len + 4;
783-
data_ += increment;
784-
len_ -= increment;
785-
estimated_data_length -= value_len;
786-
DCHECK_GE(estimated_data_length, 0);
787-
}
788-
values_decoded += static_cast<int>(run_length);
789-
return Status::OK();
790-
} else {
791-
return helper->AppendNulls(run_length);
792-
}
793-
}));
772+
RETURN_NOT_OK(
773+
VisitBitRuns(valid_bits, valid_bits_offset, num_values,
774+
[&](int64_t position, int64_t run_length, bool is_valid) {
775+
if (is_valid) {
776+
for (int64_t i = 0; i < run_length; ++i) {
777+
if (ARROW_PREDICT_FALSE(len_ < 4)) {
778+
return Status::Invalid(
779+
"Invalid or truncated PLAIN-encoded BYTE_ARRAY data");
780+
}
781+
auto value_len = SafeLoadAs<int32_t>(data_);
782+
if (ARROW_PREDICT_FALSE(value_len < 0 ||
783+
value_len > estimated_data_length)) {
784+
return Status::Invalid(
785+
"Invalid or truncated PLAIN-encoded BYTE_ARRAY data");
786+
}
787+
RETURN_NOT_OK(helper->AppendValue(data_ + 4, value_len,
788+
estimated_data_length));
789+
auto increment = value_len + 4;
790+
data_ += increment;
791+
len_ -= increment;
792+
estimated_data_length -= value_len;
793+
DCHECK_GE(estimated_data_length, 0);
794+
}
795+
values_decoded += static_cast<int>(run_length);
796+
return Status::OK();
797+
} else {
798+
return helper->AppendNulls(run_length);
799+
}
800+
}));
794801

795802
num_values_ -= values_decoded;
796803
*out_values_decoded = values_decoded;

0 commit comments

Comments
 (0)