@@ -679,12 +679,22 @@ auto RleBitPackedParser::PeekImpl(Handler&& handler) const
679679 ARROW_DCHECK_LT (static_cast <uint64_t >(count) * 8 ,
680680 internal::max_size_for_v<rle_size_t >);
681681 // Count Already divided by 8 for byte size calculations
682- const auto bytes_read = header_bytes + static_cast <int64_t >(count) * value_bit_width_;
682+ auto bytes_read = header_bytes + static_cast <int64_t >(count) * value_bit_width_;
683+ auto values_count = static_cast <rle_size_t >(count * 8 );
683684 if (ARROW_PREDICT_FALSE (bytes_read > data_size_)) {
684- // Bit-packed run would overflow data buffer
685- return {0 , ControlFlow::Break};
685+ // Bit-packed run would overflow data buffer, but we might still be able
686+ // to return a truncated bit-packed run such as generated by some non-compliant
687+ // encoders.
688+ // Example in GH-47981: column contains 25 5-bit values, has a single
689+ // bit-packed run with count=4 (theoretically 32 values), but only 17
690+ // bytes of RLE-bit-packed data (including the one-byte header).
691+ bytes_read = data_size_;
692+ values_count =
693+ static_cast <rle_size_t >((bytes_read - header_bytes) * 8 / value_bit_width_);
694+ if (values_count < 1 ) {
695+ return {0 , ControlFlow::Break};
696+ }
686697 }
687- const auto values_count = static_cast <rle_size_t >(count * 8 );
688698
689699 auto control = handler.OnBitPackedRun (
690700 BitPackedRun (data_ + header_bytes, values_count, value_bit_width_));
@@ -1215,7 +1225,8 @@ auto RleBitPackedDecoder<T>::GetBatchWithDict(const V* dictionary,
12151225 rle_size_t batch_size) -> rle_size_t {
12161226 using ControlFlow = RleBitPackedParser::ControlFlow;
12171227
1218- if (ARROW_PREDICT_FALSE (batch_size <= 0 )) {
1228+ if (ARROW_PREDICT_FALSE (batch_size <= 0 || dictionary_length == 0 )) {
1229+ // Either empty batch or invalid dictionary
12191230 return 0 ;
12201231 }
12211232
@@ -1284,6 +1295,17 @@ auto RleBitPackedDecoder<T>::GetBatchWithDictSpaced(
12841295 if (null_count == 0 ) {
12851296 return GetBatchWithDict<V>(dictionary, dictionary_length, out, batch_size);
12861297 }
1298+ if (null_count == batch_size) {
1299+ // All nulls, avoid instantiating DictionaryConverter as dictionary_length
1300+ // could be 0.
1301+ std::fill (out, out + batch_size, V{});
1302+ return batch_size;
1303+ }
1304+ if (ARROW_PREDICT_FALSE (batch_size <= 0 || dictionary_length == 0 )) {
1305+ // Either empty batch or invalid dictionary
1306+ return 0 ;
1307+ }
1308+
12871309 internal::DictionaryConverter<V, value_type> converter{dictionary, dictionary_length};
12881310
12891311 return GetSpaced (converter, out, batch_size, valid_bits, valid_bits_offset, null_count);
0 commit comments