Skip to content

Commit 79f9764

Browse files
authored
GH-47112: [Parquet][C++] Rle BitPacked parser (#47294)
### Rationale for this change ### What changes are included in this PR? New independent abstractions: - A `BitPackedRun` to describe the encoded bytes in a bit packed run. - A minimal `BitPackedDecoder` that can decode this type of run (no dict/spaced methods). - A `RleRun` to describe the encoded value in a RLE run. - A minimal `RleDecoder` that can decode this type of run (no dict/spaced methods). - A `RleBitPackedParser` that read the encoded headers and emits different runs. These new abstractions are then plugged into `RleBitPackedDecoder` (formerly `RleDecode`) to keep the compatibility with the rest of Arrow (improvements to using the parser independently can come in follow-up PR). Misc changes: - Separation of LEB128 reading/writing from `BitReader` into a free functions, and add check for a special case for handling undefined behavior overflow. ### Are these changes tested? Yes, on top of the existing tests, many more unit tests have been added. ### Are there any user-facing changes? API changes to internal classes. * GitHub Issue: #47112 Authored-by: AntoinePrv <[email protected]> Signed-off-by: Antoine Pitrou <[email protected]>
1 parent 64f2055 commit 79f9764

14 files changed

+2004
-578
lines changed

CPPLINT.cfg

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,5 +26,7 @@ filter = -readability/alt_tokens
2626
filter = -readability/casting
2727
filter = -readability/todo
2828
filter = -runtime/references
29+
# Let the formatter do the job for whitespaces
2930
filter = -whitespace/comments
31+
filter = -whitespace/braces
3032
linelength = 90

cpp/src/arrow/util/bit_run_reader.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ inline bool operator!=(const BitRun& lhs, const BitRun& rhs) {
5252

5353
class BitRunReaderLinear {
5454
public:
55+
BitRunReaderLinear() = default;
56+
5557
BitRunReaderLinear(const uint8_t* bitmap, int64_t start_offset, int64_t length)
5658
: reader_(bitmap, start_offset, length) {}
5759

@@ -74,6 +76,8 @@ class BitRunReaderLinear {
7476
/// in a bitmap.
7577
class ARROW_EXPORT BitRunReader {
7678
public:
79+
BitRunReader() = default;
80+
7781
/// \brief Constructs new BitRunReader.
7882
///
7983
/// \param[in] bitmap source data

cpp/src/arrow/util/bit_stream_utils_internal.h

Lines changed: 82 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <algorithm>
2323
#include <cstdint>
2424
#include <cstring>
25+
#include <type_traits>
2526

2627
#include "arrow/util/bit_util.h"
2728
#include "arrow/util/bpacking_internal.h"
@@ -30,8 +31,7 @@
3031
#include "arrow/util/macros.h"
3132
#include "arrow/util/ubsan.h"
3233

33-
namespace arrow {
34-
namespace bit_util {
34+
namespace arrow::bit_util {
3535

3636
/// Utility class to write bit/byte streams. This class can write data to either be
3737
/// bit packed or byte aligned (and a single stream that has a mix of both).
@@ -73,19 +73,14 @@ class BitWriter {
7373
/// room. The value is written byte aligned.
7474
/// For more details on vlq:
7575
/// en.wikipedia.org/wiki/Variable-length_quantity
76-
bool PutVlqInt(uint32_t v);
76+
template <typename Int>
77+
bool PutVlqInt(Int v);
7778

78-
// Writes an int zigzag encoded.
79-
bool PutZigZagVlqInt(int32_t v);
80-
81-
/// Write a Vlq encoded int64 to the buffer. Returns false if there was not enough
82-
/// room. The value is written byte aligned.
83-
/// For more details on vlq:
84-
/// en.wikipedia.org/wiki/Variable-length_quantity
85-
bool PutVlqInt(uint64_t v);
86-
87-
// Writes an int64 zigzag encoded.
88-
bool PutZigZagVlqInt(int64_t v);
79+
/// Writes a zigzag encoded signed integer.
80+
/// Zigzag encoding is used to encode possibly negative numbers by alternating positive
81+
/// and negative ones.
82+
template <typename Int>
83+
bool PutZigZagVlqInt(Int v);
8984

9085
/// Get a pointer to the next aligned byte and advance the underlying buffer
9186
/// by num_bytes.
@@ -128,14 +123,14 @@ inline uint64_t ReadLittleEndianWord(const uint8_t* buffer, int bytes_remaining)
128123
/// bytes in one read (e.g. encoded int).
129124
class BitReader {
130125
public:
131-
BitReader() = default;
126+
BitReader() noexcept = default;
132127

133128
/// 'buffer' is the buffer to read from. The buffer's length is 'buffer_len'.
134129
BitReader(const uint8_t* buffer, int buffer_len) : BitReader() {
135130
Reset(buffer, buffer_len);
136131
}
137132

138-
void Reset(const uint8_t* buffer, int buffer_len) {
133+
void Reset(const uint8_t* buffer, int buffer_len) noexcept {
139134
buffer_ = buffer;
140135
max_bytes_ = buffer_len;
141136
byte_offset_ = 0;
@@ -169,18 +164,14 @@ class BitReader {
169164
/// Reads a vlq encoded int from the stream. The encoded int must start at
170165
/// the beginning of a byte. Return false if there were not enough bytes in
171166
/// the buffer.
172-
bool GetVlqInt(uint32_t* v);
173-
174-
// Reads a zigzag encoded int `into` v.
175-
bool GetZigZagVlqInt(int32_t* v);
176-
177-
/// Reads a vlq encoded int64 from the stream. The encoded int must start at
178-
/// the beginning of a byte. Return false if there were not enough bytes in
179-
/// the buffer.
180-
bool GetVlqInt(uint64_t* v);
167+
template <typename Int>
168+
bool GetVlqInt(Int* v);
181169

182-
// Reads a zigzag encoded int64 `into` v.
183-
bool GetZigZagVlqInt(int64_t* v);
170+
/// Reads a zigzag encoded integer into a signed integer output v.
171+
/// Zigzag encoding is used to decode possibly negative numbers by alternating positive
172+
/// and negative ones.
173+
template <typename Int>
174+
bool GetZigZagVlqInt(Int* v);
184175

185176
/// Returns the number of bytes left in the stream, not including the current
186177
/// byte (i.e., there may be an additional fraction of a byte).
@@ -189,12 +180,6 @@ class BitReader {
189180
(byte_offset_ + static_cast<int>(bit_util::BytesForBits(bit_offset_)));
190181
}
191182

192-
/// Maximum byte length of a vlq encoded int
193-
static constexpr int kMaxVlqByteLength = 5;
194-
195-
/// Maximum byte length of a vlq encoded int64
196-
static constexpr int kMaxVlqByteLengthForInt64 = 10;
197-
198183
private:
199184
const uint8_t* buffer_;
200185
int max_bytes_;
@@ -439,91 +424,92 @@ inline bool BitReader::Advance(int64_t num_bits) {
439424
return true;
440425
}
441426

442-
inline bool BitWriter::PutVlqInt(uint32_t v) {
443-
bool result = true;
444-
while ((v & 0xFFFFFF80UL) != 0UL) {
445-
result &= PutAligned<uint8_t>(static_cast<uint8_t>((v & 0x7F) | 0x80), 1);
446-
v >>= 7;
447-
}
448-
result &= PutAligned<uint8_t>(static_cast<uint8_t>(v & 0x7F), 1);
449-
return result;
450-
}
427+
template <typename Int>
428+
inline bool BitWriter::PutVlqInt(Int v) {
429+
static_assert(std::is_integral_v<Int>);
451430

452-
inline bool BitReader::GetVlqInt(uint32_t* v) {
453-
uint32_t tmp = 0;
431+
constexpr auto kBufferSize = kMaxLEB128ByteLenFor<Int>;
454432

455-
for (int i = 0; i < kMaxVlqByteLength; i++) {
456-
uint8_t byte = 0;
457-
if (ARROW_PREDICT_FALSE(!GetAligned<uint8_t>(1, &byte))) {
433+
uint8_t buffer[kBufferSize] = {};
434+
const auto bytes_written = WriteLEB128(v, buffer, kBufferSize);
435+
ARROW_DCHECK_LE(bytes_written, kBufferSize);
436+
if constexpr (std::is_signed_v<Int>) {
437+
// Can fail if negative
438+
if (ARROW_PREDICT_FALSE(!bytes_written == 0)) {
458439
return false;
459440
}
460-
tmp |= static_cast<uint32_t>(byte & 0x7F) << (7 * i);
441+
} else {
442+
// Cannot fail since we gave max space
443+
ARROW_DCHECK_GT(bytes_written, 0);
444+
}
461445

462-
if ((byte & 0x80) == 0) {
463-
*v = tmp;
464-
return true;
446+
for (int i = 0; i < bytes_written; ++i) {
447+
const bool success = PutAligned(buffer[i], 1);
448+
if (ARROW_PREDICT_FALSE(!success)) {
449+
return false;
465450
}
466451
}
467452

468-
return false;
469-
}
470-
471-
inline bool BitWriter::PutZigZagVlqInt(int32_t v) {
472-
uint32_t u_v = ::arrow::util::SafeCopy<uint32_t>(v);
473-
u_v = (u_v << 1) ^ static_cast<uint32_t>(v >> 31);
474-
return PutVlqInt(u_v);
475-
}
476-
477-
inline bool BitReader::GetZigZagVlqInt(int32_t* v) {
478-
uint32_t u;
479-
if (!GetVlqInt(&u)) return false;
480-
u = (u >> 1) ^ (~(u & 1) + 1);
481-
*v = ::arrow::util::SafeCopy<int32_t>(u);
482453
return true;
483454
}
484455

485-
inline bool BitWriter::PutVlqInt(uint64_t v) {
486-
bool result = true;
487-
while ((v & 0xFFFFFFFFFFFFFF80ULL) != 0ULL) {
488-
result &= PutAligned<uint8_t>(static_cast<uint8_t>((v & 0x7F) | 0x80), 1);
489-
v >>= 7;
456+
template <typename Int>
457+
inline bool BitReader::GetVlqInt(Int* v) {
458+
static_assert(std::is_integral_v<Int>);
459+
460+
// The data that we will pass to the LEB128 parser
461+
// In all case, we read a byte-aligned value, skipping remaining bits
462+
const uint8_t* data = NULLPTR;
463+
int max_size = 0;
464+
465+
// Number of bytes left in the buffered values, not including the current
466+
// byte (i.e., there may be an additional fraction of a byte).
467+
const int bytes_left_in_cache =
468+
sizeof(buffered_values_) - static_cast<int>(bit_util::BytesForBits(bit_offset_));
469+
470+
// If there are clearly enough bytes left we can try to parse from the cache
471+
if (bytes_left_in_cache >= kMaxLEB128ByteLenFor<Int>) {
472+
max_size = bytes_left_in_cache;
473+
data = reinterpret_cast<const uint8_t*>(&buffered_values_) +
474+
bit_util::BytesForBits(bit_offset_);
475+
// Otherwise, we try straight from buffer (ignoring few bytes that may be cached)
476+
} else {
477+
max_size = bytes_left();
478+
data = buffer_ + (max_bytes_ - max_size);
490479
}
491-
result &= PutAligned<uint8_t>(static_cast<uint8_t>(v & 0x7F), 1);
492-
return result;
493-
}
494480

495-
inline bool BitReader::GetVlqInt(uint64_t* v) {
496-
uint64_t tmp = 0;
497-
498-
for (int i = 0; i < kMaxVlqByteLengthForInt64; i++) {
499-
uint8_t byte = 0;
500-
if (ARROW_PREDICT_FALSE(!GetAligned<uint8_t>(1, &byte))) {
501-
return false;
502-
}
503-
tmp |= static_cast<uint64_t>(byte & 0x7F) << (7 * i);
504-
505-
if ((byte & 0x80) == 0) {
506-
*v = tmp;
507-
return true;
508-
}
481+
const auto bytes_read = bit_util::ParseLeadingLEB128(data, max_size, v);
482+
if (ARROW_PREDICT_FALSE(bytes_read == 0)) {
483+
// Corrupt LEB128
484+
return false;
509485
}
510486

511-
return false;
487+
// Advance for the bytes we have read + the bits we skipped
488+
return Advance((8 * bytes_read) + (bit_offset_ % 8));
512489
}
513490

514-
inline bool BitWriter::PutZigZagVlqInt(int64_t v) {
515-
uint64_t u_v = ::arrow::util::SafeCopy<uint64_t>(v);
516-
u_v = (u_v << 1) ^ static_cast<uint64_t>(v >> 63);
491+
template <typename Int>
492+
inline bool BitWriter::PutZigZagVlqInt(Int v) {
493+
static_assert(std::is_integral_v<Int>);
494+
static_assert(std::is_signed_v<Int>);
495+
using UInt = std::make_unsigned_t<Int>;
496+
constexpr auto kBitSize = 8 * sizeof(Int);
497+
498+
UInt u_v = ::arrow::util::SafeCopy<UInt>(v);
499+
u_v = (u_v << 1) ^ static_cast<UInt>(v >> (kBitSize - 1));
517500
return PutVlqInt(u_v);
518501
}
519502

520-
inline bool BitReader::GetZigZagVlqInt(int64_t* v) {
521-
uint64_t u;
503+
template <typename Int>
504+
inline bool BitReader::GetZigZagVlqInt(Int* v) {
505+
static_assert(std::is_integral_v<Int>);
506+
static_assert(std::is_signed_v<Int>);
507+
508+
std::make_unsigned_t<Int> u;
522509
if (!GetVlqInt(&u)) return false;
523510
u = (u >> 1) ^ (~(u & 1) + 1);
524-
*v = ::arrow::util::SafeCopy<int64_t>(u);
511+
*v = ::arrow::util::SafeCopy<Int>(u);
525512
return true;
526513
}
527514

528-
} // namespace bit_util
529-
} // namespace arrow
515+
} // namespace arrow::bit_util

0 commit comments

Comments
 (0)