Skip to content

Commit 11d64b5

Browse files
authored
GH-48245: [C++][Parquet] Simplify GetVlqInt (#48237)
### Rationale for this change The `BitReader::GetVlqInt` implementation currently tries to read first from the cached value before falling back to reading from the buffer. But this doesn't bring any benefit, since both code paths lead to the same processing step afterwards. So we can remove the code path that tries to read from the cached value. This will also make it easier to support big-endian platforms. ### Are these changes tested? Yes, by existing tests. ### Are there any user-facing changes? No. Authored-by: Antoine Pitrou <[email protected]> Signed-off-by: Antoine Pitrou <[email protected]>
1 parent a17e247 commit 11d64b5

File tree

1 file changed

+21
-21
lines changed

1 file changed

+21
-21
lines changed

cpp/src/arrow/util/bit_stream_utils_internal.h

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,10 @@ class BitReader {
160160
/// are not enough bits left.
161161
bool Advance(int64_t num_bits);
162162

163+
/// Advance the stream by a number of bytes, ignoring remaning bits.
164+
/// Returns true if succeed or false if there are not enough bits left.
165+
bool AdvanceBytes(int num_bytes);
166+
163167
/// Reads a vlq encoded int from the stream. The encoded int must start at
164168
/// the beginning of a byte. Return false if there were not enough bytes in
165169
/// the buffer.
@@ -328,6 +332,17 @@ inline bool BitReader::Advance(int64_t num_bits) {
328332
return true;
329333
}
330334

335+
inline bool BitReader::AdvanceBytes(int num_bytes) {
336+
if (ARROW_PREDICT_FALSE(num_bytes > max_bytes_ - byte_offset_)) {
337+
return false;
338+
}
339+
byte_offset_ += num_bytes;
340+
bit_offset_ = 0;
341+
buffered_values_ =
342+
detail::ReadLittleEndianWord(buffer_ + byte_offset_, max_bytes_ - byte_offset_);
343+
return true;
344+
}
345+
331346
template <typename Int>
332347
inline bool BitWriter::PutVlqInt(Int v) {
333348
static_assert(std::is_integral_v<Int>);
@@ -362,34 +377,19 @@ inline bool BitReader::GetVlqInt(Int* v) {
362377
static_assert(std::is_integral_v<Int>);
363378

364379
// The data that we will pass to the LEB128 parser
365-
// In all case, we read a byte-aligned value, skipping remaining bits
366-
const uint8_t* data = NULLPTR;
367-
int max_size = 0;
368-
369-
// Number of bytes left in the buffered values, not including the current
370-
// byte (i.e., there may be an additional fraction of a byte).
371-
const int bytes_left_in_cache =
372-
sizeof(buffered_values_) - static_cast<int>(bit_util::BytesForBits(bit_offset_));
373-
374-
// If there are clearly enough bytes left we can try to parse from the cache
375-
if (bytes_left_in_cache >= kMaxLEB128ByteLenFor<Int>) {
376-
max_size = bytes_left_in_cache;
377-
data = reinterpret_cast<const uint8_t*>(&buffered_values_) +
378-
bit_util::BytesForBits(bit_offset_);
379-
// Otherwise, we try straight from buffer (ignoring few bytes that may be cached)
380-
} else {
381-
max_size = bytes_left();
382-
data = buffer_ + (max_bytes_ - max_size);
383-
}
380+
// We read a byte-aligned value, skipping remaining bits.
381+
// Also, we don't bother with the cache since the decoding would be the same.
382+
int max_size = bytes_left();
383+
const uint8_t* data = buffer_ + (max_bytes_ - max_size);
384384

385385
const auto bytes_read = bit_util::ParseLeadingLEB128(data, max_size, v);
386386
if (ARROW_PREDICT_FALSE(bytes_read == 0)) {
387387
// Corrupt LEB128
388388
return false;
389389
}
390390

391-
// Advance for the bytes we have read + the bits we skipped
392-
return Advance((8 * bytes_read) + (bit_offset_ % 8));
391+
// Advance for the bytes we have read
392+
return AdvanceBytes(bytes_read);
393393
}
394394

395395
template <typename Int>

0 commit comments

Comments
 (0)