fork-apache-arrow/cpp/src/arrow/util/bit_stream_utils_internal.h at 53453c8f4f71112a5013eb14c177143aef19dfc8 · AntoinePrv/fork-apache-arrow · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

// From Apache Impala (incubating) as of 2016-01-29

#pragma once

#include <cstdint>
#include <cstring>
#include <type_traits>

#include "arrow/util/bit_util.h"
#include "arrow/util/bpacking_internal.h"
#include "arrow/util/endian.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"
#include "arrow/util/ubsan.h"

namespace arrow::bit_util {

/// Utility class to write bit/byte streams.  This class can write data to either be
/// bit packed or byte aligned (and a single stream that has a mix of both).
/// This class does not allocate memory.
class BitWriter {
 public:
  /// buffer: buffer to write bits to.  Buffer should be preallocated with
  /// 'buffer_len' bytes.  The writer only stores the pointer; it never takes
  /// ownership of the memory.
  BitWriter(uint8_t* buffer, int buffer_len) : buffer_(buffer), max_bytes_(buffer_len) {
    Clear();
  }

  /// Rewinds the writer to the start of the buffer and drops any bits that are
  /// still staged in buffered_values_.  The bytes already present in the
  /// underlying buffer are left as they are.
  void Clear() {
    buffered_values_ = 0;
    byte_offset_ = 0;
    bit_offset_ = 0;
  }

  /// The number of current bytes written, including the current byte (i.e. may include a
  /// fraction of a byte). Includes buffered values.
  int bytes_written() const {
    return byte_offset_ + static_cast<int>(bit_util::BytesForBits(bit_offset_));
  }
  /// The start of the output buffer given at construction.
  uint8_t* buffer() const { return buffer_; }
  /// The capacity, in bytes, of the output buffer given at construction.
  int buffer_len() const { return max_bytes_; }

  /// Writes a value to buffered_values_, flushing to buffer_ if necessary.  This is bit
  /// packed.  Returns false if there was not enough space. num_bits must be <= 64
  /// and v must not have any bit set at or above position num_bits (the
  /// implementation asserts both with ARROW_DCHECK).
  bool PutValue(uint64_t v, int num_bits);

  /// Writes v to the next aligned byte using num_bytes. If T is larger than
  /// num_bytes, the extra high-order bytes will be ignored. Returns false if
  /// there was not enough space.
  /// Assume the v is stored in buffer_ as a little-endian format
  template <typename T>
  bool PutAligned(T v, int num_bytes);

  /// Write a Vlq encoded int to the buffer.  Returns false if there was not enough
  /// room.  The value is written byte aligned.
  /// For more details on vlq:
  /// en.wikipedia.org/wiki/Variable-length_quantity
  template <typename Int>
  bool PutVlqInt(Int v);

  /// Writes a zigzag encoded signed integer.
  /// Zigzag encoding is used to encode possibly negative numbers by alternating positive
  /// and negative ones.
  template <typename Int>
  bool PutZigZagVlqInt(Int v);

  /// Get a pointer to the next aligned byte and advance the underlying buffer
  /// by num_bytes.
  /// Returns NULL if there was not enough space.
  uint8_t* GetNextBytePtr(int num_bytes = 1);

  /// Flushes all buffered values to the buffer. Call this when done writing to
  /// the buffer.  If 'align' is true, buffered_values_ is reset and any future
  /// writes will be written to the next byte boundary.
  void Flush(bool align = false);

 private:
  uint8_t* buffer_;
  int max_bytes_;

  /// Bit-packed values are initially written to this variable before being memcpy'd to
  /// buffer_. This is faster than writing values byte by byte directly to buffer_.
  uint64_t buffered_values_;

  int byte_offset_;  // Offset in buffer_
  int bit_offset_;   // Offset in buffered_values_
};

namespace detail {

/// Loads up to 8 bytes starting at `buffer` and returns them as a 64-bit word
/// converted from little-endian to host byte order.
///
/// When fewer than 8 bytes remain, only `bytes_remaining` bytes are read and the
/// missing high-order bytes are zero. When `bytes_remaining` is zero or negative
/// nothing is read at all and 0 is returned, so `buffer` may be null or
/// one-past-the-end in that case.
inline uint64_t ReadLittleEndianWord(const uint8_t* buffer, int bytes_remaining) {
  uint64_t le_value = 0;
  if (ARROW_PREDICT_TRUE(bytes_remaining >= 8)) {
    // Fixed-size copy on the hot path.
    memcpy(&le_value, buffer, 8);
  } else if (bytes_remaining > 0) {
    // Guarded: memcpy requires valid pointers even for a zero count, and a
    // negative count would be converted to a huge size_t.
    memcpy(&le_value, buffer, bytes_remaining);
  }
  return arrow::bit_util::FromLittleEndian(le_value);
}

}  // namespace detail

/// Utility class to read bit/byte stream.  This class can read bits or bytes
/// that are either byte aligned or not.  It also has utilities to read multiple
/// bytes in one read (e.g. encoded int).
class BitReader {
 public:
  /// Creates a reader that is not attached to any buffer (null buffer, zero
  /// length). Reset() must be called before any value can be read.
  BitReader() noexcept = default;

  /// 'buffer' is the buffer to read from.  The buffer's length is 'buffer_len'.
  BitReader(const uint8_t* buffer, int buffer_len) : BitReader() {
    Reset(buffer, buffer_len);
  }

  /// Re-targets the reader at 'buffer' (of 'buffer_len' bytes), rewinds it to the
  /// first bit and preloads the first word of the stream into buffered_values_.
  void Reset(const uint8_t* buffer, int buffer_len) noexcept {
    buffer_ = buffer;
    max_bytes_ = buffer_len;
    byte_offset_ = 0;
    bit_offset_ = 0;
    buffered_values_ =
        detail::ReadLittleEndianWord(buffer_ + byte_offset_, max_bytes_ - byte_offset_);
  }

  /// Gets the next value from the buffer.  Returns true if 'v' could be read or false if
  /// there are not enough bytes left.
  template <typename T>
  bool GetValue(int num_bits, T* v);

  /// Get a number of values from the buffer. Return the number of values actually read.
  template <typename T>
  int GetBatch(int num_bits, T* v, int batch_size);

  /// Reads a 'num_bytes'-sized value from the buffer and stores it in 'v'. T
  /// needs to be a little-endian native type and big enough to store
  /// 'num_bytes'. The value is assumed to be byte-aligned so the stream will
  /// be advanced to the start of the next byte before 'v' is read. Returns
  /// false if there are not enough bytes left.
  /// Assume the v was stored in buffer_ as a little-endian format
  template <typename T>
  bool GetAligned(int num_bytes, T* v);

  /// Advances the stream by a number of bits. Returns true if succeed or false if there
  /// are not enough bits left.
  bool Advance(int64_t num_bits);

  /// Reads a vlq encoded int from the stream.  The encoded int must start at
  /// the beginning of a byte. Return false if there were not enough bytes in
  /// the buffer.
  template <typename Int>
  bool GetVlqInt(Int* v);

  /// Reads a zigzag encoded integer into a signed integer output v.
  /// Zigzag encoding is used to decode possibly negative numbers by alternating positive
  /// and negative ones.
  template <typename Int>
  bool GetZigZagVlqInt(Int* v);

  /// Returns the number of bytes left in the stream, not including the current
  /// byte (i.e., there may be an additional fraction of a byte).
  int bytes_left() const {
    return max_bytes_ -
           (byte_offset_ + static_cast<int>(bit_util::BytesForBits(bit_offset_)));
  }

 private:
  // Default member initializers give a default-constructed reader a well-defined
  // "empty" state instead of indeterminate values.
  const uint8_t* buffer_ = nullptr;
  int max_bytes_ = 0;

  /// Bytes are memcpy'd from buffer_ and values are read from this variable. This is
  /// faster than reading values byte by byte directly from buffer_.
  uint64_t buffered_values_ = 0;

  int byte_offset_ = 0;  // Offset in buffer_
  int bit_offset_ = 0;   // Offset in buffered_values_
};

/// Appends the low `num_bits` bits of `v` to the stream, bit-packed.
///
/// `num_bits` must be at most 64 and `v` must not have any bit set at or above
/// position `num_bits`; both are asserted with ARROW_DCHECK. Returns false,
/// leaving the writer untouched, when the value does not fit in the remaining
/// space of the buffer.
inline bool BitWriter::PutValue(uint64_t v, int num_bits) {
  ARROW_DCHECK_LE(num_bits, 64);
  if (num_bits < 64) {
    ARROW_DCHECK_EQ(v >> num_bits, 0) << "v = " << v << ", num_bits = " << num_bits;
  }

  // The capacity check is done in 64-bit arithmetic: `max_bytes_ * 8` does not
  // fit in an `int` once the buffer reaches 256 MiB.
  const int64_t end_bit =
      static_cast<int64_t>(byte_offset_) * 8 + bit_offset_ + num_bits;
  const int64_t capacity_bits = static_cast<int64_t>(max_bytes_) * 8;
  if (ARROW_PREDICT_FALSE(end_bit > capacity_bits)) {
    return false;
  }

  buffered_values_ |= v << bit_offset_;
  bit_offset_ += num_bits;

  if (ARROW_PREDICT_FALSE(bit_offset_ >= 64)) {
    // The staging word is full: write it out in little-endian order.
    buffered_values_ = arrow::bit_util::ToLittleEndian(buffered_values_);
    memcpy(buffer_ + byte_offset_, &buffered_values_, 8);
    byte_offset_ += 8;
    bit_offset_ -= 64;
    // `bits_already_written` bits of v went into the word just flushed; the
    // remaining high bits seed the next word. Shifting a 64-bit value by 64 is
    // undefined, hence the special case (v was then written entirely).
    const int bits_already_written = num_bits - bit_offset_;
    buffered_values_ = (bits_already_written == 64) ? 0 : (v >> bits_already_written);
  }
  ARROW_DCHECK_LT(bit_offset_, 64);
  return true;
}

inline void BitWriter::Flush(bool align) {
  int num_bytes = static_cast<int>(bit_util::BytesForBits(bit_offset_));
  ARROW_DCHECK_LE(byte_offset_ + num_bytes, max_bytes_);
  auto buffered_values = arrow::bit_util::ToLittleEndian(buffered_values_);
  memcpy(buffer_ + byte_offset_, &buffered_values, num_bytes);

  if (align) {
    buffered_values_ = 0;
    byte_offset_ += num_bytes;
    bit_offset_ = 0;
  }
}

/// Byte-aligns the stream (flushing any staged bits), then reserves `num_bytes`
/// bytes in the output buffer.
///
/// Returns a pointer to the first reserved byte, or a null pointer when fewer
/// than `num_bytes` bytes remain. The alignment flush happens in both cases.
inline uint8_t* BitWriter::GetNextBytePtr(int num_bytes) {
  Flush(/* align */ true);
  ARROW_DCHECK_LE(byte_offset_, max_bytes_);

  const int end_offset = byte_offset_ + num_bytes;
  if (end_offset > max_bytes_) {
    return nullptr;
  }

  uint8_t* const reserved = buffer_ + byte_offset_;
  byte_offset_ = end_offset;
  return reserved;
}

/// Writes the first `num_bytes` bytes of the little-endian representation of
/// `val` at the next byte boundary.
///
/// Returns false when the buffer cannot hold `num_bytes` more bytes; the stream
/// has still been byte-aligned by then.
template <typename T>
inline bool BitWriter::PutAligned(T val, int num_bytes) {
  uint8_t* const dest = GetNextBytePtr(num_bytes);
  if (dest != nullptr) {
    // Copying from the little-endian form keeps the low-order bytes when
    // num_bytes is smaller than sizeof(T).
    const T le_val = arrow::bit_util::ToLittleEndian(val);
    memcpy(dest, &le_val, num_bytes);
    return true;
  }
  return false;
}

/// Reads a single `num_bits`-wide value into `*v`.
///
/// Implemented as a batch read of size one; returns true only if that one value
/// was actually read.
template <typename T>
inline bool BitReader::GetValue(int num_bits, T* v) {
  const int values_read = GetBatch(num_bits, v, /*batch_size=*/1);
  return values_read == 1;
}

/// Reads up to `batch_size` bit-packed values of `num_bits` bits each into `v`
/// and advances the stream past them.
///
/// If the stream holds fewer than `batch_size` values, the batch is truncated to
/// the number of whole values that remain. Returns the number of values read.
/// `num_bits` must not exceed the bit width of T (asserted with ARROW_DCHECK).
template <typename T>
inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) {
  constexpr int64_t kBitsPerByte = 8;

  ARROW_DCHECK(buffer_ != NULLPTR);
  ARROW_DCHECK_LE(num_bits, static_cast<int>(sizeof(T) * 8)) << "num_bits: " << num_bits;

  // All bit counts are computed in 64 bits: they can reach 8 * max_bytes_, which
  // does not fit in an int for large buffers.
  const int64_t needed_bits = num_bits * static_cast<int64_t>(batch_size);
  const int64_t remaining_bits =
      static_cast<int64_t>(max_bytes_ - byte_offset_) * kBitsPerByte - bit_offset_;
  if (remaining_bits < needed_bits) {
    // Not enough data: only decode the values that are entirely available.
    batch_size = static_cast<int>(remaining_bits / num_bits);
  }

  if constexpr (std::is_same_v<T, bool>) {
    ::arrow::internal::unpack(buffer_ + byte_offset_, v, batch_size, num_bits,
                              bit_offset_);

  } else {
    // Non-bool outputs are handed to unpack through their unsigned counterpart.
    ::arrow::internal::unpack(buffer_ + byte_offset_,
                              reinterpret_cast<std::make_unsigned_t<T>*>(v), batch_size,
                              num_bits, bit_offset_);
  }

  // Widen before multiplying: `batch_size * num_bits` can exceed INT_MAX.
  Advance(static_cast<int64_t>(batch_size) * num_bits);

  return batch_size;
}

/// Reads `num_bytes` bytes from the next byte boundary into `*v`, interpreting
/// them as a little-endian value.
///
/// Any partially consumed byte at the current position is skipped first.
/// Returns false, without moving the stream, if `num_bytes` is larger than T or
/// if fewer than `num_bytes` bytes remain after alignment.
template <typename T>
inline bool BitReader::GetAligned(int num_bytes, T* v) {
  if (ARROW_PREDICT_FALSE(num_bytes > static_cast<int>(sizeof(T)))) {
    return false;
  }

  // First byte that has not been touched by bit-level reads.
  const int consumed_bytes = static_cast<int>(bit_util::BytesForBits(bit_offset_));
  if (ARROW_PREDICT_FALSE(byte_offset_ + consumed_bytes + num_bytes > max_bytes_)) {
    return false;
  }
  const int read_start = byte_offset_ + consumed_bytes;
  const uint8_t* const src = buffer_ + read_start;

  if constexpr (std::is_same_v<T, bool>) {
    // ARROW-18031: a bool must only ever hold 0 or 1, so rather than copying the
    // raw byte into it (potential undefined behavior), keep only the LSB of the
    // next byte.
    *v = *src & 1;
  } else {
    memcpy(v, src, num_bytes);
    *v = arrow::bit_util::FromLittleEndian(*v);
  }

  // Move past the value and reload the cached word from the new position.
  byte_offset_ = read_start + num_bytes;
  bit_offset_ = 0;
  buffered_values_ =
      detail::ReadLittleEndianWord(buffer_ + byte_offset_, max_bytes_ - byte_offset_);
  return true;
}

/// Moves the read position forward by `num_bits` bits and reloads the cached
/// word from the new position.
///
/// Returns false, leaving the reader unchanged, when the bytes needed to cover
/// the requested bits exceed what is left in the buffer.
inline bool BitReader::Advance(int64_t num_bits) {
  // Target position, in bits, relative to the current byte offset.
  const int64_t target_bits = bit_offset_ + num_bits;
  const int64_t bytes_needed = bit_util::BytesForBits(target_bits);
  const int bytes_available = max_bytes_ - byte_offset_;
  if (ARROW_PREDICT_FALSE(bytes_needed > bytes_available)) {
    return false;
  }

  // Split the target into whole bytes and a sub-byte remainder.
  const int whole_bytes = static_cast<int>(target_bits >> 3);
  const int leftover_bits = static_cast<int>(target_bits & 7);
  byte_offset_ += whole_bytes;
  bit_offset_ = leftover_bits;
  buffered_values_ =
      detail::ReadLittleEndianWord(buffer_ + byte_offset_, max_bytes_ - byte_offset_);
  return true;
}

/// Encodes `v` as LEB128 and writes the resulting bytes at the next byte
/// boundary.
///
/// Returns false if the value could not be encoded (WriteLEB128 produced zero
/// bytes, which the signed branch treats as a failure for negative input) or if
/// the output buffer ran out of space. In the latter case the bytes that did fit
/// have already been written.
template <typename Int>
inline bool BitWriter::PutVlqInt(Int v) {
  static_assert(std::is_integral_v<Int>);

  constexpr auto kBufferSize = kMaxLEB128ByteLenFor<Int>;

  // Encode into a scratch buffer large enough for any value of this type.
  uint8_t buffer[kBufferSize] = {};
  const auto bytes_written = WriteLEB128(v, buffer, kBufferSize);
  ARROW_DCHECK_LE(bytes_written, kBufferSize);
  if constexpr (std::is_signed_v<Int>) {
    // Can fail if negative: zero bytes written means nothing was encoded.
    if (ARROW_PREDICT_FALSE(bytes_written == 0)) {
      return false;
    }
  } else {
    // Cannot fail since we gave max space
    ARROW_DCHECK_GT(bytes_written, 0);
  }

  for (int i = 0; i < bytes_written; ++i) {
    const bool success = PutAligned(buffer[i], 1);
    if (ARROW_PREDICT_FALSE(!success)) {
      return false;
    }
  }

  return true;
}

/// Reads a LEB128-encoded integer that starts at the next byte boundary and
/// stores it in `*v`.
///
/// Any partially consumed byte at the current position is skipped. On success
/// the stream is left byte-aligned right after the encoded value. Returns false
/// if the bytes do not form a valid encoding for `Int` or if the value would run
/// past the end of the buffer.
template <typename Int>
inline bool BitReader::GetVlqInt(Int* v) {
  static_assert(std::is_integral_v<Int>);

  // Bytes at the current position that are already (at least partly) consumed.
  // The encoded value begins right after them.
  const int consumed_bytes = static_cast<int>(bit_util::BytesForBits(bit_offset_));

  // The data that we will pass to the LEB128 parser
  const uint8_t* data = NULLPTR;
  int max_size = 0;

  // Number of whole, untouched bytes left in the cached word.
  const int bytes_left_in_cache =
      static_cast<int>(sizeof(buffered_values_)) - consumed_bytes;

  // If there are clearly enough bytes left we can try to parse from the cache
  if (bytes_left_in_cache >= kMaxLEB128ByteLenFor<Int>) {
    // The cached word is zero-padded past the end of the buffer; a value that
    // extends into the padding is rejected by the bounds check in Advance below.
    // NOTE(review): this reads the in-memory bytes of a host-order word, which
    // matches stream order only on little-endian hosts -- confirm big-endian
    // targets are handled elsewhere.
    max_size = bytes_left_in_cache;
    data = reinterpret_cast<const uint8_t*>(&buffered_values_) + consumed_bytes;
    // Otherwise, we try straight from buffer (ignoring few bytes that may be cached)
  } else {
    max_size = bytes_left();
    data = buffer_ + (max_bytes_ - max_size);
  }

  const auto bytes_read = bit_util::ParseLeadingLEB128(data, max_size, v);
  if (ARROW_PREDICT_FALSE(bytes_read == 0)) {
    // Corrupt LEB128
    return false;
  }

  // Advance over the padding bits skipped to reach the byte boundary (not the
  // bits already consumed) plus the bytes of the encoded value.
  const int64_t skipped_bits = 8 * static_cast<int64_t>(consumed_bytes) - bit_offset_;
  return Advance(8 * static_cast<int64_t>(bytes_read) + skipped_bits);
}

/// Zigzag-encodes the signed integer `v` and writes the result as a VLQ integer.
///
/// The encoding shifts the value left by one bit and XORs it with its sign
/// (all ones for negative values, zero otherwise). Returns whatever PutVlqInt
/// returns for the encoded value.
template <typename Int>
inline bool BitWriter::PutZigZagVlqInt(Int v) {
  static_assert(std::is_integral_v<Int>);
  static_assert(std::is_signed_v<Int>);
  using UInt = std::make_unsigned_t<Int>;
  constexpr auto kSignShift = 8 * sizeof(Int) - 1;

  // Bit pattern of v, moved up by one to make room for the sign in bit 0.
  const UInt shifted = ::arrow::util::SafeCopy<UInt>(v) << 1;
  // Sign bit of v replicated across the whole word by the right shift.
  const UInt sign_mask = static_cast<UInt>(v >> kSignShift);
  const UInt encoded = static_cast<UInt>(shifted ^ sign_mask);
  return PutVlqInt(encoded);
}

/// Reads a VLQ integer and zigzag-decodes it into the signed output `*v`.
///
/// Returns false, leaving `*v` untouched, when the underlying VLQ read fails.
template <typename Int>
inline bool BitReader::GetZigZagVlqInt(Int* v) {
  static_assert(std::is_integral_v<Int>);
  static_assert(std::is_signed_v<Int>);
  using UInt = std::make_unsigned_t<Int>;

  UInt encoded = 0;
  if (!GetVlqInt(&encoded)) {
    return false;
  }

  // Bit 0 carries the sign: negating it yields all ones for negative values and
  // zero otherwise.
  const UInt sign_mask = static_cast<UInt>(~(encoded & 1) + 1);
  const UInt decoded = static_cast<UInt>((encoded >> 1) ^ sign_mask);
  *v = ::arrow::util::SafeCopy<Int>(decoded);
  return true;
}

}  // namespace arrow::bit_util