Skip to content

Commit 761caee

Browse files
committed
GH-47981: [C++][Parquet] Add compatibility with non-compliant RLE stream
1 parent 5112de2 commit 761caee

File tree

1 file changed

+27
-5
lines changed

1 file changed

+27
-5
lines changed

cpp/src/arrow/util/rle_encoding_internal.h

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -679,12 +679,22 @@ auto RleBitPackedParser::PeekImpl(Handler&& handler) const
679679
ARROW_DCHECK_LT(static_cast<uint64_t>(count) * 8,
680680
internal::max_size_for_v<rle_size_t>);
681681
// Count is already divided by 8 for byte size calculations
682-
const auto bytes_read = header_bytes + static_cast<int64_t>(count) * value_bit_width_;
682+
auto bytes_read = header_bytes + static_cast<int64_t>(count) * value_bit_width_;
683+
auto values_count = static_cast<rle_size_t>(count * 8);
683684
if (ARROW_PREDICT_FALSE(bytes_read > data_size_)) {
684-
// Bit-packed run would overflow data buffer
685-
return {0, ControlFlow::Break};
685+
// Bit-packed run would overflow data buffer, but we might still be able
686+
// to return a truncated bit-packed run such as generated by some non-compliant
687+
// encoders.
688+
// Example in GH-47981: column contains 25 5-bit values, has a single
689+
// bit-packed run with count=4 (theoretically 32 values), but only 17
690+
// bytes of RLE-bit-packed data (including the one-byte header).
691+
bytes_read = data_size_;
692+
values_count =
693+
static_cast<rle_size_t>((bytes_read - header_bytes) * 8 / value_bit_width_);
694+
if (values_count < 1) {
695+
return {0, ControlFlow::Break};
696+
}
686697
}
687-
const auto values_count = static_cast<rle_size_t>(count * 8);
688698

689699
auto control = handler.OnBitPackedRun(
690700
BitPackedRun(data_ + header_bytes, values_count, value_bit_width_));
@@ -1215,7 +1225,8 @@ auto RleBitPackedDecoder<T>::GetBatchWithDict(const V* dictionary,
12151225
rle_size_t batch_size) -> rle_size_t {
12161226
using ControlFlow = RleBitPackedParser::ControlFlow;
12171227

1218-
if (ARROW_PREDICT_FALSE(batch_size <= 0)) {
1228+
if (ARROW_PREDICT_FALSE(batch_size <= 0 || dictionary_length == 0)) {
1229+
// Either empty batch or invalid dictionary
12191230
return 0;
12201231
}
12211232

@@ -1284,6 +1295,17 @@ auto RleBitPackedDecoder<T>::GetBatchWithDictSpaced(
12841295
if (null_count == 0) {
12851296
return GetBatchWithDict<V>(dictionary, dictionary_length, out, batch_size);
12861297
}
1298+
if (null_count == batch_size) {
1299+
// All nulls, avoid instantiating DictionaryConverter as dictionary_length
1300+
// could be 0.
1301+
std::fill(out, out + batch_size, V{});
1302+
return batch_size;
1303+
}
1304+
if (ARROW_PREDICT_FALSE(batch_size <= 0 || dictionary_length == 0)) {
1305+
// Either empty batch or invalid dictionary
1306+
return 0;
1307+
}
1308+
12871309
internal::DictionaryConverter<V, value_type> converter{dictionary, dictionary_length};
12881310

12891311
return GetSpaced(converter, out, batch_size, valid_bits, valid_bits_offset, null_count);

0 commit comments

Comments
 (0)