Skip to content

Commit b3eadff

Browse files
committed
GH-47981: [C++][Parquet] Add compatibility with non-compliant RLE stream
1 parent 6a08785 commit b3eadff

File tree

1 file changed

+27
-5
lines changed

1 file changed

+27
-5
lines changed

cpp/src/arrow/util/rle_encoding_internal.h

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -674,12 +674,22 @@ auto RleBitPackedParser::PeekImpl(Handler&& handler) const
674674
ARROW_DCHECK_LT(static_cast<uint64_t>(count) * 8,
675675
internal::max_size_for_v<rle_size_t>);
676676
// Count is already divided by 8 for byte size calculations
677-
const auto bytes_read = header_bytes + static_cast<int64_t>(count) * value_bit_width_;
677+
auto bytes_read = header_bytes + static_cast<int64_t>(count) * value_bit_width_;
678+
auto values_count = static_cast<rle_size_t>(count * 8);
678679
if (ARROW_PREDICT_FALSE(bytes_read > data_size_)) {
679-
// Bit-packed run would overflow data buffer
680-
return {0, ControlFlow::Break};
680+
// Bit-packed run would overflow data buffer, but we might still be able
681+
// to return a truncated bit-packed run, such as generated by some non-compliant
682+
// encoders.
683+
// Example in GH-47981: column contains 25 5-bit values, has a single
684+
// bit-packed run with count=4 (theoretically 32 values), but only 17
685+
// bytes of RLE-bit-packed data (including the one-byte header).
686+
bytes_read = data_size_;
687+
values_count =
688+
static_cast<rle_size_t>((bytes_read - header_bytes) * 8 / value_bit_width_);
689+
if (values_count < 1) {
690+
return {0, ControlFlow::Break};
691+
}
681692
}
682-
const auto values_count = static_cast<rle_size_t>(count * 8);
683693

684694
auto control = handler.OnBitPackedRun(
685695
BitPackedRun(data_ + header_bytes, values_count, value_bit_width_));
@@ -1210,7 +1220,8 @@ auto RleBitPackedDecoder<T>::GetBatchWithDict(const V* dictionary,
12101220
rle_size_t batch_size) -> rle_size_t {
12111221
using ControlFlow = RleBitPackedParser::ControlFlow;
12121222

1213-
if (ARROW_PREDICT_FALSE(batch_size <= 0)) {
1223+
if (ARROW_PREDICT_FALSE(batch_size <= 0 || dictionary_length == 0)) {
1224+
// Either empty batch or invalid dictionary
12141225
return 0;
12151226
}
12161227

@@ -1279,6 +1290,17 @@ auto RleBitPackedDecoder<T>::GetBatchWithDictSpaced(
12791290
if (null_count == 0) {
12801291
return GetBatchWithDict<V>(dictionary, dictionary_length, out, batch_size);
12811292
}
1293+
if (null_count == batch_size) {
1294+
// All nulls, avoid instantiating DictionaryConverter as dictionary_length
1295+
// could be 0.
1296+
std::fill(out, out + batch_size, V{});
1297+
return batch_size;
1298+
}
1299+
if (ARROW_PREDICT_FALSE(batch_size <= 0 || dictionary_length == 0)) {
1300+
// Either empty batch or invalid dictionary
1301+
return 0;
1302+
}
1303+
12821304
internal::DictionaryConverter<V, value_type> converter{dictionary, dictionary_length};
12831305

12841306
return GetSpaced(converter, out, batch_size, valid_bits, valid_bits_offset, null_count);

0 commit comments

Comments
 (0)