Skip to content

Commit 2118a20

Browse files
xiaoxmeng authored and meta-codesync[bot] committed
feat: Add prefix encoding support (facebookincubator#421)
Summary:

Pull Request resolved: facebookincubator#421

This diff adds a new PrefixEncoding for Nimble that efficiently stores sorted string data with prefix compression. Sorted string data (e.g., dictionary keys, index keys) often has significant common prefixes between consecutive entries. PrefixEncoding exploits this by storing only the shared prefix length and the unique suffix for each entry, significantly reducing storage space.

Implementation

The encoding stores, for each entry:
- Shared prefix length (uint32): the number of bytes shared with the previous entry
- Suffix length (uint32): the length of the unique suffix
- Suffix data: the actual suffix bytes

Restart points are full entries (shared_prefix_len = 0) stored at regular intervals (default: 16) to enable efficient seek operations without decoding from the beginning.

Binary Layout

- Standard Encoding prefix (8 bytes)
- Restart interval (4 bytes)
- Number of restarts (4 bytes)
- Encoded entries
- Restart offsets array (uint32 per restart point)

Key Features

- Prefix compression: Reduces storage for sorted strings with common prefixes
- Efficient seek: Binary search on restart points + linear scan within a block achieves O(log(n/interval) + interval) seek complexity
- Skip optimization: Leverages restart points to skip across blocks efficiently
- Buffer management: The materializedValues_ buffer keeps string_views valid between materialize() calls

Follow-up: extend the encoding API to accept encoding options, which would allow configuring the restart interval.

Reviewed By: HuamengJiang, tanjialiang, zzhao0

Differential Revision: D90654439

fbshipit-source-id: 77e0368674fcdcdf8120061041510faedd350687
1 parent 191cd4a commit 2118a20

19 files changed

+2387
-81
lines changed

dwio/nimble/common/Types.cpp

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -47,6 +47,8 @@ std::string toString(EncodingType encodingType) {
4747
return "MainlyConstant";
4848
case EncodingType::Sentinel:
4949
return "Sentinel";
50+
case EncodingType::Prefix:
51+
return "Prefix";
5052
}
5153
return fmt::format(
5254
"Unknown encoding type: {}", static_cast<int32_t>(encodingType));

dwio/nimble/common/Types.h

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -100,6 +100,10 @@ enum class EncodingType {
100100
// using a bool child vector to store whether each row is that special value,
101101
// and stores the non-special values as a separate encoding.
102102
MainlyConstant = 10,
103+
// Stores sorted string data with prefix compression. Common prefixes are
104+
// shared across consecutive entries to reduce storage. Supports seek
105+
// operations for efficient random access.
106+
Prefix = 11,
103107
};
104108
std::string toString(EncodingType encodingType);
105109
std::ostream& operator<<(std::ostream& out, EncodingType encodingType);

dwio/nimble/common/Vector.h

Lines changed: 68 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -22,31 +22,30 @@
2222
#include <array>
2323
#include <memory>
2424

25-
// Basically equivalent to std::vector, but without the edge case for booleans,
26-
// i.e. data() returns T* for all T. This allows for implicit conversion to
27-
// std::span for all T.
28-
2925
namespace facebook::nimble {
3026

27+
/// A vector-like container similar to std::vector, but without the edge case
28+
/// for booleans. Unlike std::vector<bool>, data() returns T* for all T,
29+
/// allowing implicit conversion to std::span for all T.
3130
template <typename T>
3231
class Vector {
3332
using InnerType =
3433
typename std::conditional<std::is_same_v<T, bool>, uint8_t, T>::type;
3534

3635
public:
37-
Vector(velox::memory::MemoryPool* memoryPool, size_t size, T value)
38-
: memoryPool_{memoryPool} {
36+
/// Constructs a vector with the given size, filled with the specified value.
37+
Vector(velox::memory::MemoryPool* pool, size_t size, T value) : pool_{pool} {
3938
init(size);
4039
std::fill(dataRawPtr_, dataRawPtr_ + size_, value);
4140
}
4241

43-
Vector(velox::memory::MemoryPool* memoryPool, size_t size)
44-
: memoryPool_{memoryPool} {
42+
/// Constructs a vector with the given size, with uninitialized elements.
43+
Vector(velox::memory::MemoryPool* pool, size_t size) : pool_{pool} {
4544
init(size);
4645
}
4746

48-
explicit Vector(velox::memory::MemoryPool* memoryPool)
49-
: memoryPool_{memoryPool} {
47+
/// Constructs an empty vector.
48+
explicit Vector(velox::memory::MemoryPool* pool) : pool_{pool} {
5049
capacity_ = 0;
5150
size_ = 0;
5251
data_ = nullptr;
@@ -56,33 +55,37 @@ class Vector {
5655
#endif
5756
}
5857

58+
/// Constructs a vector from an iterator range.
5959
template <typename It>
60-
Vector(velox::memory::MemoryPool* memoryPool, It first, It last)
61-
: memoryPool_{memoryPool} {
60+
Vector(velox::memory::MemoryPool* pool, It first, It last) : pool_{pool} {
6261
auto size = last - first;
6362
init(size);
6463
std::copy(first, last, dataRawPtr_);
6564
}
6665

66+
/// Copy constructor.
6767
Vector(const Vector& other) {
6868
*this = other;
6969
}
7070

71+
/// Copy assignment operator.
7172
Vector& operator=(const Vector& other) {
7273
if (this != &other) {
7374
size_ = other.size();
7475
capacity_ = other.capacity_;
75-
memoryPool_ = other.memoryPool_;
76+
pool_ = other.pool_;
7677
allocateBuffer();
7778
std::copy(other.dataRawPtr_, other.dataRawPtr_ + size_, dataRawPtr_);
7879
}
7980
return *this;
8081
}
8182

83+
/// Move constructor.
8284
Vector(Vector&& other) noexcept {
8385
*this = std::move(other);
8486
}
8587

88+
/// Move assignment operator.
8689
Vector& operator=(Vector&& other) noexcept {
8790
if (this != &other) {
8891
size_ = other.size();
@@ -94,81 +97,106 @@ class Vector {
9497
if (data_ != nullptr) {
9598
dataRawPtr_ = reinterpret_cast<T*>(data_->asMutable<InnerType>());
9699
}
97-
memoryPool_ = other.memoryPool_;
100+
pool_ = other.pool_;
98101
other.size_ = 0;
99102
other.capacity_ = 0;
100103
}
101104
return *this;
102105
}
103106

104-
Vector(velox::memory::MemoryPool* memoryPool, std::initializer_list<T> l)
105-
: memoryPool_{memoryPool} {
107+
/// Constructs a vector from an initializer list.
108+
Vector(velox::memory::MemoryPool* pool, std::initializer_list<T> l)
109+
: pool_{pool} {
106110
init(l.size());
107111
std::copy(l.begin(), l.end(), dataRawPtr_);
108112
}
109113

110-
inline velox::memory::MemoryPool* memoryPool() {
111-
return memoryPool_;
114+
/// Returns the memory pool used by this vector.
115+
inline velox::memory::MemoryPool* pool() {
116+
return pool_;
112117
}
113118

119+
/// Returns the number of elements in the vector.
114120
uint64_t size() const {
115121
return size_;
116122
}
123+
124+
/// Returns true if the vector is empty.
117125
bool empty() const {
118126
return size_ == 0;
119127
}
128+
129+
/// Returns the current capacity of the vector.
120130
uint64_t capacity() const {
121131
return capacity_;
122132
}
133+
/// Returns a reference to the element at the given index.
123134
T& operator[](uint64_t i) {
124135
return dataRawPtr_[i];
125136
}
137+
138+
/// Returns a const reference to the element at the given index.
126139
const T& operator[](uint64_t i) const {
127140
return dataRawPtr_[i];
128141
}
142+
143+
/// Returns a pointer to the first element.
129144
T* begin() {
130145
return dataRawPtr_;
131146
}
147+
148+
/// Returns a pointer past the last element.
132149
T* end() {
133150
return dataRawPtr_ + size_;
134151
}
152+
153+
/// Returns a const pointer to the first element.
135154
const T* begin() const {
136155
return dataRawPtr_;
137156
}
157+
158+
/// Returns a const pointer past the last element.
138159
const T* end() const {
139160
return dataRawPtr_ + size_;
140161
}
162+
163+
/// Returns a reference to the last element.
141164
T& back() {
142165
return dataRawPtr_[size_ - 1];
143166
}
167+
168+
/// Returns a const reference to the last element.
144169
const T& back() const {
145170
return dataRawPtr_[size_ - 1];
146171
}
147172

148-
// Directly updates the size_ to |size|. Useful if you've filled in some data
149-
// directly using the underlying raw pointers.
173+
/// Directly updates the size to the given value.
174+
/// Useful if you've filled in data directly using the underlying raw
175+
/// pointers.
150176
void update_size(uint64_t size) {
151177
size_ = size;
152178
}
153179

154-
// Fills all of data_ with T().
180+
/// Fills all elements with the default value T().
155181
void zero_out() {
156182
std::fill(dataRawPtr_, dataRawPtr_ + size_, T());
157183
}
158184

159-
// Fills all of data_ with the given value.
185+
/// Fills all elements with the given value.
160186
void fill(T value) {
161187
std::fill(dataRawPtr_, dataRawPtr_ + size_, value);
162188
}
163189

164-
// Resets *this to a newly constructed empty state.
190+
/// Resets the vector to an empty state, releasing allocated memory.
165191
void clear() {
166192
capacity_ = 0;
167193
size_ = 0;
168194
data_.reset();
169195
dataRawPtr_ = nullptr;
170196
}
171197

198+
/// Inserts elements from [inputStart, inputEnd) at the given output
199+
/// position.
172200
void insert(T* output, const T* inputStart, const T* inputEnd) {
173201
const uint64_t inputSize = inputEnd - inputStart;
174202
const uint64_t distanceToEnd = end() - output;
@@ -182,21 +210,24 @@ class Vector {
182210
}
183211
}
184212

185-
// Add |copies| copies of |value| to the end of the vector.
213+
/// Appends the given number of copies of value to the end of the vector.
186214
void extend(uint64_t copies, T value) {
187215
reserve(size_ + copies);
188216
std::fill(end(), end() + copies, value);
189217
size_ += copies;
190218
}
191219

220+
/// Returns a pointer to the underlying data.
192221
T* data() noexcept {
193222
return dataRawPtr_;
194223
}
195224

225+
/// Returns a const pointer to the underlying data.
196226
const T* data() const noexcept {
197227
return dataRawPtr_;
198228
}
199229

230+
/// Appends the given value to the end of the vector.
200231
void push_back(T value) {
201232
if (size_ == capacity_) {
202233
reserve(calculateNewSize(capacity_));
@@ -205,6 +236,7 @@ class Vector {
205236
++size_;
206237
}
207238

239+
/// Constructs an element in-place at the end of the vector.
208240
template <typename... Args>
209241
void emplace_back(Args&&... args) {
210242
if (size_ == capacity_) {
@@ -215,13 +247,13 @@ class Vector {
215247
++size_;
216248
}
217249

218-
// Ensures that *this can hold |size| elements. Does NOT shrink to
219-
// fit if |size| is less than size(), and does NOT initialize any new
220-
// values.
250+
/// Ensures the vector can hold at least the given number of elements.
251+
/// Does NOT shrink if size is less than current size, and does NOT
252+
/// initialize any new elements.
221253
void reserve(uint64_t size) {
222254
if (size > capacity_) {
223255
auto newData =
224-
velox::AlignedBuffer::allocateExact<InnerType>(size, memoryPool_);
256+
velox::AlignedBuffer::allocateExact<InnerType>(size, pool_);
225257
// AlignedBuffer can allocate a bit more than requested for the alignment
226258
// purpose, let's leverage that by using its true capacity.
227259
capacity_ = newData->capacity() / sizeof(InnerType);
@@ -238,15 +270,15 @@ class Vector {
238270
}
239271
}
240272

241-
// Changes size_ to |size|. Does NOT shrink to fit the new size, and
242-
// does NOT initialize any new elements if |size| is greater than size_.
273+
/// Changes the size of the vector.
274+
/// Does NOT shrink capacity, and does NOT initialize new elements.
243275
void resize(uint64_t size) {
244276
reserve(size);
245277
size_ = size;
246278
}
247279

248-
// Changes size_ to |newSize|. Does NOT shrink to fit the new size. Initialize
249-
// any new elements to |value| if |newSize| is greater than size_.
280+
/// Changes the size of the vector, initializing new elements to value.
281+
/// Does NOT shrink capacity.
250282
void resize(uint64_t newSize, const T& value) {
251283
auto initialSize = size_;
252284
resize(newSize);
@@ -256,6 +288,8 @@ class Vector {
256288
}
257289
}
258290

291+
/// Releases ownership of the underlying buffer and returns it.
292+
/// The vector is left in an empty state after this call.
259293
velox::BufferPtr releaseOwnership() {
260294
velox::BufferPtr tmp = std::move(data_);
261295
tmp->setSize(size_);
@@ -286,16 +320,15 @@ class Vector {
286320
}
287321

288322
inline void allocateBuffer() {
289-
data_ =
290-
velox::AlignedBuffer::allocateExact<InnerType>(capacity_, memoryPool_);
323+
data_ = velox::AlignedBuffer::allocateExact<InnerType>(capacity_, pool_);
291324
dataRawPtr_ = reinterpret_cast<T*>(data_->asMutable<InnerType>());
292325
uint64_t newCapacity = data_->capacity() / sizeof(InnerType);
293326
NIMBLE_DCHECK_GE(
294327
newCapacity, capacity_, "Allocated capacity is smaller than requested");
295328
capacity_ = newCapacity;
296329
}
297330

298-
velox::memory::MemoryPool* memoryPool_;
331+
velox::memory::MemoryPool* pool_;
299332
velox::BufferPtr data_;
300333
uint64_t capacity_;
301334
uint64_t size_;

dwio/nimble/encodings/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ add_library(
2323
EncodingLayout.cpp
2424
EncodingLayoutCapture.cpp
2525
MainlyConstantEncoding.cpp
26+
PrefixEncoding.cpp
2627
RleEncoding.cpp
2728
SparseBoolEncoding.cpp
2829
Statistics.cpp

dwio/nimble/encodings/EncodingFactory.cpp

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@
2020
#include "dwio/nimble/encodings/FixedBitWidthEncoding.h"
2121
#include "dwio/nimble/encodings/MainlyConstantEncoding.h"
2222
#include "dwio/nimble/encodings/NullableEncoding.h"
23+
#include "dwio/nimble/encodings/PrefixEncoding.h"
2324
#include "dwio/nimble/encodings/RleEncoding.h"
2425
#include "dwio/nimble/encodings/SparseBoolEncoding.h"
2526
#include "dwio/nimble/encodings/TrivialEncoding.h"
@@ -229,6 +230,13 @@ std::unique_ptr<Encoding> EncodingFactory::decode(
229230
case EncodingType::MainlyConstant: {
230231
RETURN_ENCODING_BY_NON_BOOL_TYPE(MainlyConstantEncoding, dataType);
231232
}
233+
case EncodingType::Prefix: {
234+
NIMBLE_CHECK_EQ(
235+
dataType,
236+
DataType::String,
237+
"Trying to deserialize a PrefixEncoding with a non-string data type.");
238+
return std::make_unique<PrefixEncoding>(memoryPool, data);
239+
}
232240
default: {
233241
NIMBLE_UNREACHABLE(
234242
"Trying to deserialize invalid EncodingType:{} -- garbage input?",
@@ -336,6 +344,14 @@ std::string_view EncodingFactory::encode(
336344
return SparseBoolEncoding::encode(selection, castedValues, buffer);
337345
}
338346
}
347+
case EncodingType::Prefix: {
348+
if constexpr (!std::is_same<T, std::string_view>::value) {
349+
NIMBLE_INCOMPATIBLE_ENCODING(
350+
"Prefix encoding should only be selected for string_view data types.");
351+
} else {
352+
return PrefixEncoding::encode(selection, castedValues, buffer);
353+
}
354+
}
339355
default: {
340356
NIMBLE_UNSUPPORTED(
341357
"Encoding {} is not supported.", toString(selection.encodingType()));

dwio/nimble/encodings/EncodingLayoutCapture.cpp

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -30,8 +30,8 @@ constexpr uint32_t kEncodingPrefixSize = 6;
3030
} // namespace
3131

3232
EncodingLayout EncodingLayoutCapture::capture(std::string_view encoding) {
33-
NIMBLE_CHECK(
34-
encoding.size() >= kEncodingPrefixSize, "Encoding size too small.");
33+
NIMBLE_CHECK_GE(
34+
encoding.size(), kEncodingPrefixSize, "Encoding size too small.");
3535

3636
const auto encodingType =
3737
encoding::peek<uint8_t, EncodingType>(encoding.data());
@@ -47,7 +47,8 @@ EncodingLayout EncodingLayoutCapture::capture(std::string_view encoding) {
4747
switch (encodingType) {
4848
case EncodingType::FixedBitWidth:
4949
case EncodingType::Varint:
50-
case EncodingType::Constant: {
50+
case EncodingType::Constant:
51+
case EncodingType::Prefix: {
5152
// Non nested encodings have zero children
5253
break;
5354
}

dwio/nimble/encodings/EncodingLayoutCapture.h

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -22,10 +22,10 @@ namespace facebook::nimble {
2222

2323
class EncodingLayoutCapture {
2424
public:
25-
// Captures an encoding tree from an encoded stream.
26-
// It traverses the encoding headers in the stream and produces a serialized
27-
// encoding tree layout.
28-
// |encoding| - The serialized encoding
25+
/// Captures an encoding tree from an encoded stream.
26+
/// It traverses the encoding headers in the stream and produces a serialized
27+
/// encoding tree layout.
28+
/// |encoding| - The serialized encoding
2929
static EncodingLayout capture(std::string_view encoding);
3030
};
3131

0 commit comments

Comments
 (0)