2222#include < algorithm>
2323#include < cstdint>
2424#include < cstring>
25+ #include < type_traits>
2526
2627#include " arrow/util/bit_util.h"
2728#include " arrow/util/bpacking_internal.h"
3031#include " arrow/util/macros.h"
3132#include " arrow/util/ubsan.h"
3233
33- namespace arrow {
34- namespace bit_util {
34+ namespace arrow ::bit_util {
3535
3636// / Utility class to write bit/byte streams. This class can write data to either be
3737// / bit packed or byte aligned (and a single stream that has a mix of both).
@@ -73,19 +73,14 @@ class BitWriter {
7373 // / room. The value is written byte aligned.
7474 // / For more details on vlq:
7575 // / en.wikipedia.org/wiki/Variable-length_quantity
76- bool PutVlqInt (uint32_t v);
76+ template <typename Int>
77+ bool PutVlqInt (Int v);
7778
78- // Writes an int zigzag encoded.
79- bool PutZigZagVlqInt (int32_t v);
80-
81- // / Write a Vlq encoded int64 to the buffer. Returns false if there was not enough
82- // / room. The value is written byte aligned.
83- // / For more details on vlq:
84- // / en.wikipedia.org/wiki/Variable-length_quantity
85- bool PutVlqInt (uint64_t v);
86-
87- // Writes an int64 zigzag encoded.
88- bool PutZigZagVlqInt (int64_t v);
79+ // / Writes a zigzag encoded signed integer.
80+ // / Zigzag encoding is used to encode possibly negative numbers by alternating positive
81+ // / and negative ones.
82+ template <typename Int>
83+ bool PutZigZagVlqInt (Int v);
8984
9085 // / Get a pointer to the next aligned byte and advance the underlying buffer
9186 // / by num_bytes.
@@ -128,14 +123,14 @@ inline uint64_t ReadLittleEndianWord(const uint8_t* buffer, int bytes_remaining)
128123// / bytes in one read (e.g. encoded int).
129124class BitReader {
130125 public:
131- BitReader () = default ;
126+ BitReader () noexcept = default ;
132127
133128 // / 'buffer' is the buffer to read from. The buffer's length is 'buffer_len'.
134129 BitReader (const uint8_t * buffer, int buffer_len) : BitReader() {
135130 Reset (buffer, buffer_len);
136131 }
137132
138- void Reset (const uint8_t * buffer, int buffer_len) {
133+ void Reset (const uint8_t * buffer, int buffer_len) noexcept {
139134 buffer_ = buffer;
140135 max_bytes_ = buffer_len;
141136 byte_offset_ = 0 ;
@@ -169,18 +164,14 @@ class BitReader {
169164 // / Reads a vlq encoded int from the stream. The encoded int must start at
170165 // / the beginning of a byte. Return false if there were not enough bytes in
171166 // / the buffer.
172- bool GetVlqInt (uint32_t * v);
173-
174- // Reads a zigzag encoded int `into` v.
175- bool GetZigZagVlqInt (int32_t * v);
176-
177- // / Reads a vlq encoded int64 from the stream. The encoded int must start at
178- // / the beginning of a byte. Return false if there were not enough bytes in
179- // / the buffer.
180- bool GetVlqInt (uint64_t * v);
167+ template <typename Int>
168+ bool GetVlqInt (Int* v);
181169
182- // Reads a zigzag encoded int64 `into` v.
183- bool GetZigZagVlqInt (int64_t * v);
170+ // / Reads a zigzag encoded integer into a signed integer output v.
171+ // / Zigzag encoding is used to decode possibly negative numbers by alternating positive
172+ // / and negative ones.
173+ template <typename Int>
174+ bool GetZigZagVlqInt (Int* v);
184175
185176 // / Returns the number of bytes left in the stream, not including the current
186177 // / byte (i.e., there may be an additional fraction of a byte).
@@ -189,12 +180,6 @@ class BitReader {
189180 (byte_offset_ + static_cast <int >(bit_util::BytesForBits (bit_offset_)));
190181 }
191182
192- // / Maximum byte length of a vlq encoded int
193- static constexpr int kMaxVlqByteLength = 5 ;
194-
195- // / Maximum byte length of a vlq encoded int64
196- static constexpr int kMaxVlqByteLengthForInt64 = 10 ;
197-
198183 private:
199184 const uint8_t * buffer_;
200185 int max_bytes_;
@@ -439,91 +424,92 @@ inline bool BitReader::Advance(int64_t num_bits) {
439424 return true ;
440425}
441426
442- inline bool BitWriter::PutVlqInt (uint32_t v) {
443- bool result = true ;
444- while ((v & 0xFFFFFF80UL ) != 0UL ) {
445- result &= PutAligned<uint8_t >(static_cast <uint8_t >((v & 0x7F ) | 0x80 ), 1 );
446- v >>= 7 ;
447- }
448- result &= PutAligned<uint8_t >(static_cast <uint8_t >(v & 0x7F ), 1 );
449- return result;
450- }
427+ template <typename Int>
428+ inline bool BitWriter::PutVlqInt (Int v) {
429+ static_assert (std::is_integral_v<Int>);
451430
452- inline bool BitReader::GetVlqInt (uint32_t * v) {
453- uint32_t tmp = 0 ;
431+ constexpr auto kBufferSize = kMaxLEB128ByteLenFor <Int>;
454432
455- for (int i = 0 ; i < kMaxVlqByteLength ; i++) {
456- uint8_t byte = 0 ;
457- if (ARROW_PREDICT_FALSE (!GetAligned<uint8_t >(1 , &byte))) {
433+ uint8_t buffer[kBufferSize ] = {};
434+ const auto bytes_written = WriteLEB128 (v, buffer, kBufferSize );
435+ ARROW_DCHECK_LE (bytes_written, kBufferSize );
436+ if constexpr (std::is_signed_v<Int>) {
437+ // Can fail if negative
438+ if (ARROW_PREDICT_FALSE (!bytes_written == 0 )) {
458439 return false ;
459440 }
460- tmp |= static_cast <uint32_t >(byte & 0x7F ) << (7 * i);
441+ } else {
442+ // Cannot fail since we gave max space
443+ ARROW_DCHECK_GT (bytes_written, 0 );
444+ }
461445
462- if ((byte & 0x80 ) == 0 ) {
463- *v = tmp;
464- return true ;
446+ for (int i = 0 ; i < bytes_written; ++i) {
447+ const bool success = PutAligned (buffer[i], 1 );
448+ if (ARROW_PREDICT_FALSE (!success)) {
449+ return false ;
465450 }
466451 }
467452
468- return false ;
469- }
470-
471- inline bool BitWriter::PutZigZagVlqInt (int32_t v) {
472- uint32_t u_v = ::arrow::util::SafeCopy<uint32_t >(v);
473- u_v = (u_v << 1 ) ^ static_cast <uint32_t >(v >> 31 );
474- return PutVlqInt (u_v);
475- }
476-
477- inline bool BitReader::GetZigZagVlqInt (int32_t * v) {
478- uint32_t u;
479- if (!GetVlqInt (&u)) return false ;
480- u = (u >> 1 ) ^ (~(u & 1 ) + 1 );
481- *v = ::arrow::util::SafeCopy<int32_t >(u);
482453 return true ;
483454}
484455
485- inline bool BitWriter::PutVlqInt (uint64_t v) {
486- bool result = true ;
487- while ((v & 0xFFFFFFFFFFFFFF80ULL ) != 0ULL ) {
488- result &= PutAligned<uint8_t >(static_cast <uint8_t >((v & 0x7F ) | 0x80 ), 1 );
489- v >>= 7 ;
456+ template <typename Int>
457+ inline bool BitReader::GetVlqInt (Int* v) {
458+ static_assert (std::is_integral_v<Int>);
459+
460+ // The data that we will pass to the LEB128 parser
461+ // In all case, we read a byte-aligned value, skipping remaining bits
462+ const uint8_t * data = NULLPTR;
463+ int max_size = 0 ;
464+
465+ // Number of bytes left in the buffered values, not including the current
466+ // byte (i.e., there may be an additional fraction of a byte).
467+ const int bytes_left_in_cache =
468+ sizeof (buffered_values_) - static_cast <int >(bit_util::BytesForBits (bit_offset_));
469+
470+ // If there are clearly enough bytes left we can try to parse from the cache
471+ if (bytes_left_in_cache >= kMaxLEB128ByteLenFor <Int>) {
472+ max_size = bytes_left_in_cache;
473+ data = reinterpret_cast <const uint8_t *>(&buffered_values_) +
474+ bit_util::BytesForBits (bit_offset_);
475+ // Otherwise, we try straight from buffer (ignoring few bytes that may be cached)
476+ } else {
477+ max_size = bytes_left ();
478+ data = buffer_ + (max_bytes_ - max_size);
490479 }
491- result &= PutAligned<uint8_t >(static_cast <uint8_t >(v & 0x7F ), 1 );
492- return result;
493- }
494480
495- inline bool BitReader::GetVlqInt (uint64_t * v) {
496- uint64_t tmp = 0 ;
497-
498- for (int i = 0 ; i < kMaxVlqByteLengthForInt64 ; i++) {
499- uint8_t byte = 0 ;
500- if (ARROW_PREDICT_FALSE (!GetAligned<uint8_t >(1 , &byte))) {
501- return false ;
502- }
503- tmp |= static_cast <uint64_t >(byte & 0x7F ) << (7 * i);
504-
505- if ((byte & 0x80 ) == 0 ) {
506- *v = tmp;
507- return true ;
508- }
481+ const auto bytes_read = bit_util::ParseLeadingLEB128 (data, max_size, v);
482+ if (ARROW_PREDICT_FALSE (bytes_read == 0 )) {
483+ // Corrupt LEB128
484+ return false ;
509485 }
510486
511- return false ;
487+ // Advance for the bytes we have read + the bits we skipped
488+ return Advance ((8 * bytes_read) + (bit_offset_ % 8 ));
512489}
513490
514- inline bool BitWriter::PutZigZagVlqInt (int64_t v) {
515- uint64_t u_v = ::arrow::util::SafeCopy<uint64_t >(v);
516- u_v = (u_v << 1 ) ^ static_cast <uint64_t >(v >> 63 );
491+ template <typename Int>
492+ inline bool BitWriter::PutZigZagVlqInt (Int v) {
493+ static_assert (std::is_integral_v<Int>);
494+ static_assert (std::is_signed_v<Int>);
495+ using UInt = std::make_unsigned_t <Int>;
496+ constexpr auto kBitSize = 8 * sizeof (Int);
497+
498+ UInt u_v = ::arrow::util::SafeCopy<UInt>(v);
499+ u_v = (u_v << 1 ) ^ static_cast <UInt>(v >> (kBitSize - 1 ));
517500 return PutVlqInt (u_v);
518501}
519502
520- inline bool BitReader::GetZigZagVlqInt (int64_t * v) {
521- uint64_t u;
503+ template <typename Int>
504+ inline bool BitReader::GetZigZagVlqInt (Int* v) {
505+ static_assert (std::is_integral_v<Int>);
506+ static_assert (std::is_signed_v<Int>);
507+
508+ std::make_unsigned_t <Int> u;
522509 if (!GetVlqInt (&u)) return false ;
523510 u = (u >> 1 ) ^ (~(u & 1 ) + 1 );
524- *v = ::arrow::util::SafeCopy<int64_t >(u);
511+ *v = ::arrow::util::SafeCopy<Int >(u);
525512 return true ;
526513}
527514
528- } // namespace bit_util
529- } // namespace arrow
515+ } // namespace arrow::bit_util
0 commit comments