@@ -674,12 +674,22 @@ auto RleBitPackedParser::PeekImpl(Handler&& handler) const
674674 ARROW_DCHECK_LT (static_cast <uint64_t >(count) * 8 ,
675675 internal::max_size_for_v<rle_size_t >);
676676 // Count Already divided by 8 for byte size calculations
677- const auto bytes_read = header_bytes + static_cast <int64_t >(count) * value_bit_width_;
677+ auto bytes_read = header_bytes + static_cast <int64_t >(count) * value_bit_width_;
678+ auto values_count = static_cast <rle_size_t >(count * 8 );
678679 if (ARROW_PREDICT_FALSE (bytes_read > data_size_)) {
679- // Bit-packed run would overflow data buffer
680- return {0 , ControlFlow::Break};
680+ // Bit-packed run would overflow data buffer, but we might still be able
681+ // to return a truncated bit-packed run such as those generated by some non-compliant
682+ // encoders.
683+ // Example in GH-47981: column contains 25 5-bit values, has a single
684+ // bit-packed run with count=4 (theoretically 32 values), but only 17
685+ // bytes of RLE-bit-packed data (including the one-byte header).
686+ bytes_read = data_size_;
687+ values_count =
688+ static_cast <rle_size_t >((bytes_read - header_bytes) * 8 / value_bit_width_);
689+ if (values_count < 1 ) {
690+ return {0 , ControlFlow::Break};
691+ }
681692 }
682- const auto values_count = static_cast <rle_size_t >(count * 8 );
683693
684694 auto control = handler.OnBitPackedRun (
685695 BitPackedRun (data_ + header_bytes, values_count, value_bit_width_));
@@ -1210,7 +1220,8 @@ auto RleBitPackedDecoder<T>::GetBatchWithDict(const V* dictionary,
12101220 rle_size_t batch_size) -> rle_size_t {
12111221 using ControlFlow = RleBitPackedParser::ControlFlow;
12121222
1213- if (ARROW_PREDICT_FALSE (batch_size <= 0 )) {
1223+ if (ARROW_PREDICT_FALSE (batch_size <= 0 || dictionary_length == 0 )) {
1224+ // Either empty batch or invalid dictionary
12141225 return 0 ;
12151226 }
12161227
@@ -1279,6 +1290,17 @@ auto RleBitPackedDecoder<T>::GetBatchWithDictSpaced(
12791290 if (null_count == 0 ) {
12801291 return GetBatchWithDict<V>(dictionary, dictionary_length, out, batch_size);
12811292 }
1293+ if (null_count == batch_size) {
1294+ // All nulls, avoid instantiating DictionaryConverter as dictionary_length
1295+ // could be 0.
1296+ std::fill (out, out + batch_size, V{});
1297+ return batch_size;
1298+ }
1299+ if (ARROW_PREDICT_FALSE (batch_size <= 0 || dictionary_length == 0 )) {
1300+ // Either empty batch or invalid dictionary
1301+ return 0 ;
1302+ }
1303+
12821304 internal::DictionaryConverter<V, value_type> converter{dictionary, dictionary_length};
12831305
12841306 return GetSpaced (converter, out, batch_size, valid_bits, valid_bits_offset, null_count);
0 commit comments