Skip to content

Commit 74840cc

Browse files
sdruzkin authored and
facebook-github-bot committed
Split header and data size in encoding size estimation
Summary: Split header and data size in encoding size estimation so that the read factor is applied only to the compressible data. The current implementation gives funky estimates for small-sized data. Differential Revision: D84458948
1 parent 82e6bf3 commit 74840cc

File tree

3 files changed

+86
-54
lines changed

3 files changed

+86
-54
lines changed

dwio/nimble/encodings/EncodingSelectionPolicy.h

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -186,9 +186,10 @@ class ManualEncodingSelectionPolicy : public EncodingSelectionPolicy<T> {
186186
// We use read factor weights to raise/lower the favorability of each
187187
// encoding.
188188
auto readFactor = pair.second;
189-
auto cost = size.value() * readFactor;
189+
auto cost = size.value().cost(readFactor);
190190
NIMBLE_SELECTION_LOG(
191-
YELLOW << "Encoding: " << encodingType << ", Size: " << size.value()
191+
YELLOW << "Encoding: " << encodingType
192+
<< ", Size: " << size.value().size()
192193
<< ", Factor: " << readFactor << ", Cost: " << cost);
193194
if (cost < minCost) {
194195
minCost = cost;

dwio/nimble/encodings/EncodingSizeEstimation.h

Lines changed: 65 additions & 49 deletions
Original file line number | Diff line number | Diff line change
@@ -22,27 +22,40 @@
2222
#include "dwio/nimble/common/Exceptions.h"
2323
#include "dwio/nimble/common/FixedBitArray.h"
2424
#include "dwio/nimble/common/Types.h"
25+
#include "dwio/nimble/encodings/Statistics.h"
2526

2627
namespace facebook::nimble {
2728
namespace detail {
2829

30+
struct SizeEstimation {
31+
uint64_t headerSize;
32+
uint64_t dataSize;
33+
34+
uint64_t cost(double readFactor) const {
35+
return headerSize + dataSize * readFactor;
36+
}
37+
38+
uint64_t size() const {
39+
return headerSize + dataSize;
40+
}
41+
};
42+
2943
// This class is meant to quickly estimate the size of encoded data using a
3044
// given encoding type. It makes a lot of assumptions, and it is not meant to be
3145
// 100% accurate.
3246
template <typename T, bool FixedByteWidth>
3347
struct EncodingSizeEstimation {
3448
using physicalType = typename TypeTraits<T>::physicalType;
3549

36-
static std::optional<uint64_t> estimateNumericSize(
50+
static std::optional<SizeEstimation> estimateNumericSize(
3751
const EncodingType encodingType,
3852
const uint64_t entryCount,
3953
const Statistics<physicalType>& statistics) {
4054
switch (encodingType) {
4155
case EncodingType::Constant: {
4256
return statistics.uniqueCounts().size() == 1
43-
? std::optional<uint64_t>{getEncodingOverhead<
44-
EncodingType::Constant,
45-
physicalType>()}
57+
? std::optional<
58+
SizeEstimation>{{getEncodingOverhead<EncodingType::Constant, physicalType>(), 0}}
4659
: std::nullopt;
4760
}
4861
case EncodingType::MainlyConstant: {
@@ -71,25 +84,26 @@ struct EncodingSizeEstimation {
7184
// stored bit packed.
7285
const auto uncommonIndicesSize =
7386
bitPackedBytes(0, entryCount, uncommonCount);
74-
uint32_t overhead =
87+
const uint32_t overhead =
7588
getEncodingOverhead<EncodingType::MainlyConstant, physicalType>() +
7689
// Overhead for storing uncommon values
7790
getEncodingOverhead<EncodingType::FixedBitWidth, physicalType>() +
7891
// Overhead for storing uncommon bitmap
7992
getEncodingOverhead<EncodingType::SparseBool, bool>() +
8093
getEncodingOverhead<EncodingType::FixedBitWidth, uint32_t>();
81-
return overhead + sizeof(physicalType) + uncommonValueSize +
82-
uncommonIndicesSize;
94+
return {
95+
{overhead + sizeof(physicalType),
96+
uncommonValueSize + uncommonIndicesSize}};
8397
}
8498
case EncodingType::Trivial: {
85-
return getEncodingOverhead<EncodingType::Trivial, physicalType>() +
86-
(entryCount * sizeof(physicalType));
99+
return {
100+
{getEncodingOverhead<EncodingType::Trivial, physicalType>(),
101+
entryCount * sizeof(physicalType)}};
87102
}
88103
case EncodingType::FixedBitWidth: {
89-
return getEncodingOverhead<
90-
EncodingType::FixedBitWidth,
91-
physicalType>() +
92-
bitPackedBytes(statistics.min(), statistics.max(), entryCount);
104+
return {
105+
{getEncodingOverhead<EncodingType::FixedBitWidth, physicalType>(),
106+
bitPackedBytes(statistics.min(), statistics.max(), entryCount)}};
93107
}
94108
case EncodingType::Dictionary: {
95109
// Assumptions:
@@ -100,13 +114,13 @@ struct EncodingSizeEstimation {
100114
bitPackedBytes(0, statistics.uniqueCounts().size(), entryCount);
101115
const uint64_t alphabetSize =
102116
statistics.uniqueCounts().size() * sizeof(physicalType);
103-
uint32_t overhead =
117+
const uint32_t overhead =
104118
getEncodingOverhead<EncodingType::Dictionary, physicalType>() +
105119
// Alphabet overhead
106120
getEncodingOverhead<EncodingType::Trivial, physicalType>() +
107121
// Indices overhead
108122
getEncodingOverhead<EncodingType::FixedBitWidth, uint32_t>();
109-
return overhead + alphabetSize + indicesSize;
123+
return {{overhead, alphabetSize + indicesSize}};
110124
}
111125
case EncodingType::RLE: {
112126
// Assumptions:
@@ -122,13 +136,13 @@ struct EncodingSizeEstimation {
122136
statistics.minRepeat(),
123137
statistics.maxRepeat(),
124138
statistics.consecutiveRepeatCount());
125-
uint32_t overhead =
139+
const uint32_t overhead =
126140
getEncodingOverhead<EncodingType::RLE, physicalType>() +
127141
// Overhead of run values
128142
getEncodingOverhead<EncodingType::FixedBitWidth, physicalType>() +
129143
// Overhead of run lengths
130144
getEncodingOverhead<EncodingType::FixedBitWidth, uint32_t>();
131-
return overhead + runValuesSize + runLengthsSize;
145+
return {{overhead, runValuesSize + runLengthsSize}};
132146
}
133147
case EncodingType::Varint: {
134148
// Note: the condition below actually supports floating point numbers as
@@ -145,8 +159,9 @@ struct EncodingSizeEstimation {
145159
[&i](const uint64_t sum, const uint64_t bucketSize) {
146160
return sum + (bucketSize * (++i));
147161
});
148-
return getEncodingOverhead<EncodingType::Varint, physicalType>() +
149-
dataSize;
162+
return {
163+
{getEncodingOverhead<EncodingType::Varint, physicalType>(),
164+
dataSize}};
150165
} else {
151166
return std::nullopt;
152167
}
@@ -157,16 +172,15 @@ struct EncodingSizeEstimation {
157172
}
158173
}
159174

160-
static std::optional<uint64_t> estimateBoolSize(
175+
static std::optional<SizeEstimation> estimateBoolSize(
161176
const EncodingType encodingType,
162177
const size_t entryCount,
163178
const Statistics<physicalType>& statistics) {
164179
switch (encodingType) {
165180
case EncodingType::Constant: {
166181
return statistics.uniqueCounts().size() == 1
167-
? std::optional<uint64_t>{getEncodingOverhead<
168-
EncodingType::Constant,
169-
physicalType>()}
182+
? std::optional<
183+
SizeEstimation>{{getEncodingOverhead<EncodingType::Constant, physicalType>(), 0}}
170184
: std::nullopt;
171185
}
172186
case EncodingType::SparseBool: {
@@ -177,16 +191,18 @@ struct EncodingSizeEstimation {
177191
const auto exceptionCount = std::min(
178192
statistics.uniqueCounts().at(true),
179193
statistics.uniqueCounts().at(false));
180-
uint32_t overhead =
194+
const uint32_t overhead =
181195
getEncodingOverhead<EncodingType::SparseBool, physicalType>() +
182196
// Overhead for storing exception indices
183197
getEncodingOverhead<EncodingType::FixedBitWidth, uint32_t>();
184-
return overhead + sizeof(bool) +
185-
bitPackedBytes(0, entryCount, exceptionCount);
198+
return {
199+
{overhead + sizeof(bool),
200+
bitPackedBytes(0, entryCount, exceptionCount)}};
186201
}
187202
case EncodingType::Trivial: {
188-
return getEncodingOverhead<EncodingType::Trivial, physicalType>() +
189-
FixedBitArray::bufferSize(entryCount, 1);
203+
return {
204+
{getEncodingOverhead<EncodingType::Trivial, physicalType>(),
205+
FixedBitArray::bufferSize(entryCount, 1)}};
190206
}
191207
case EncodingType::RLE: {
192208
// Assumptions:
@@ -197,29 +213,28 @@ struct EncodingSizeEstimation {
197213
statistics.minRepeat(),
198214
statistics.maxRepeat(),
199215
statistics.consecutiveRepeatCount());
200-
uint32_t overhead =
216+
const uint32_t overhead =
201217
getEncodingOverhead<EncodingType::RLE, physicalType>() +
202218
// Overhead of run lengths
203219
getEncodingOverhead<EncodingType::FixedBitWidth, uint32_t>();
204-
return overhead + sizeof(bool) + runLengthsSize;
220+
return {{overhead + sizeof(bool), runLengthsSize}};
205221
}
206222
default: {
207223
return std::nullopt;
208224
}
209225
}
210226
}
211227

212-
static std::optional<uint64_t> estimateStringSize(
228+
static std::optional<SizeEstimation> estimateStringSize(
213229
const EncodingType encodingType,
214230
const size_t entryCount,
215231
const Statistics<std::string_view>& statistics) {
216232
const uint32_t maxStringSize = statistics.max().size();
217233
switch (encodingType) {
218234
case EncodingType::Constant: {
219235
return statistics.uniqueCounts().size() == 1
220-
? std::optional<uint64_t>{getEncodingOverhead<
221-
EncodingType::Constant,
222-
physicalType>(maxStringSize)}
236+
? std::optional<
237+
SizeEstimation>{{getEncodingOverhead<EncodingType::Constant, physicalType>(maxStringSize), 0}}
223238
: std::nullopt;
224239
}
225240
case EncodingType::MainlyConstant: {
@@ -264,7 +279,7 @@ struct EncodingSizeEstimation {
264279
// stored bit packed.
265280
const auto uncommonIndicesSize =
266281
bitPackedBytes(0, entryCount, uncommonCount);
267-
uint32_t overhead =
282+
const uint32_t overhead =
268283
getEncodingOverhead<EncodingType::MainlyConstant, physicalType>(
269284
maxUniqueCount->first.size()) +
270285
// Overhead for storing uncommon values
@@ -273,17 +288,18 @@ struct EncodingSizeEstimation {
273288
// Overhead for storing uncommon bitmap
274289
getEncodingOverhead<EncodingType::SparseBool, bool>();
275290

276-
return overhead + alphabetSize + uncommonIndicesSize;
291+
return {{overhead, alphabetSize + uncommonIndicesSize}};
277292
}
278293
case EncodingType::Trivial: {
279294
// We assume string lengths will be stored bit packed.
280-
return getEncodingOverhead<EncodingType::Trivial, physicalType>(
281-
maxStringSize) +
282-
statistics.totalStringsLength() +
283-
bitPackedBytes(
284-
statistics.min().size(),
285-
statistics.max().size(),
286-
entryCount);
295+
return {
296+
{getEncodingOverhead<EncodingType::Trivial, physicalType>(
297+
maxStringSize),
298+
statistics.totalStringsLength() +
299+
bitPackedBytes(
300+
statistics.min().size(),
301+
statistics.max().size(),
302+
entryCount)}};
287303
}
288304
case EncodingType::Dictionary: {
289305
// Assumptions:
@@ -305,23 +321,23 @@ struct EncodingSizeEstimation {
305321
bitPackedBytes(statistics.min().size(),
306322
statistics.max().size(),
307323
statistics.uniqueCounts().size());
308-
uint32_t overhead =
324+
const uint32_t overhead =
309325
getEncodingOverhead<EncodingType::Dictionary, physicalType>(
310326
maxStringSize) +
311327
// Alphabet overhead
312328
getEncodingOverhead<EncodingType::Trivial, physicalType>(
313329
maxStringSize) +
314330
// Indices overhead
315331
getEncodingOverhead<EncodingType::FixedBitWidth, uint32_t>();
316-
return overhead + alphabetSize + indicesSize;
332+
return {{overhead, alphabetSize + indicesSize}};
317333
}
318334
case EncodingType::RLE: {
319335
// Assumptions:
320336
// Run values are stored using dictionary (and inside, trivial +
321337
// bit-packing). Run lengths are stored using bit-packing (with bit
322338
// width needed to store max repetition count).
323339

324-
uint64_t runValuesSize =
340+
const uint64_t runValuesSize =
325341
// (unique) strings blob size
326342
std::accumulate(
327343
statistics.uniqueCounts().cbegin(),
@@ -344,23 +360,23 @@ struct EncodingSizeEstimation {
344360
statistics.minRepeat(),
345361
statistics.maxRepeat(),
346362
statistics.consecutiveRepeatCount());
347-
uint32_t overhead =
363+
const uint32_t overhead =
348364
getEncodingOverhead<EncodingType::RLE, physicalType>() +
349365
// Overhead of run values
350366
getEncodingOverhead<EncodingType::Dictionary, physicalType>() +
351367
getEncodingOverhead<EncodingType::Trivial, physicalType>() +
352368
getEncodingOverhead<EncodingType::FixedBitWidth, uint32_t>() +
353369
// Overhead of run lengths
354370
getEncodingOverhead<EncodingType::FixedBitWidth, uint32_t>();
355-
return overhead + runValuesSize + runLengthsSize;
371+
return {{overhead, runValuesSize + runLengthsSize}};
356372
}
357373
default: {
358374
return std::nullopt;
359375
}
360376
}
361377
}
362378

363-
static std::optional<uint64_t> estimateSize(
379+
static std::optional<SizeEstimation> estimateSize(
364380
const EncodingType encodingType,
365381
const size_t entryCount,
366382
const Statistics<physicalType>& statistics) {

dwio/nimble/encodings/tests/EncodingSelectionTests.cpp

Lines changed: 18 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -134,7 +134,7 @@ void verifySizeEstimate(
134134
encodingTypeForEstimation,
135135
values.size(),
136136
nimble::Statistics<T>::create(values));
137-
EXPECT_EQ(estimatedSize, expectedEstimatedSize);
137+
EXPECT_EQ(estimatedSize.value().size(), expectedEstimatedSize);
138138
}
139139

140140
template <typename T>
@@ -180,7 +180,7 @@ void test(std::span<const T> values, std::vector<EncodingDetails> expected) {
180180
LOG(INFO) << "Expected: " << expected[i].encodingType << "<"
181181
<< expected[i].dataType << ">[" << expected[i].nestedEncodingName
182182
<< ":" << expected[i].level << "]";
183-
LOG(INFO) << "Actual: " << actual[i].encodingType << "<"
183+
LOG(INFO) << "Actual: " << actual[i].encodingType << "<"
184184
<< actual[i].dataType << ">[" << actual[i].nestedEncodingName
185185
<< ":" << actual[i].level << "]";
186186
EXPECT_EQ(expected[i].encodingType, actual[i].encodingType);
@@ -435,7 +435,7 @@ TYPED_TEST(EncodingSelectionNumericTests, SelectRunLength) {
435435

436436
if constexpr (
437437
nimble::isFloatingPointType<T>() || std::is_same_v<int32_t, T> ||
438-
sizeof(T) > 4) {
438+
sizeof(T) >= 4) {
439439
// Floating point types and big types prefer storing the run values as
440440
// dictionary
441441
test<T>(
@@ -994,3 +994,18 @@ TEST(EncodingSelectionTests, TestNullable) {
994994

995995
LOG(INFO) << "Final size: " << serialized.size();
996996
}
997+
998+
TEST(EncodingSelectionTests, TestSizeEstimateCost) {
999+
std::vector<uint8_t> values{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
1000+
auto estimatedSize =
1001+
nimble::detail::EncodingSizeEstimation<uint8_t, false>::estimateSize(
1002+
nimble::EncodingType::Trivial,
1003+
values.size(),
1004+
nimble::Statistics<uint8_t>::create(values))
1005+
.value();
1006+
EXPECT_EQ(estimatedSize.headerSize, 7);
1007+
EXPECT_EQ(estimatedSize.dataSize, 10);
1008+
EXPECT_EQ(estimatedSize.size(), 17);
1009+
EXPECT_EQ(estimatedSize.cost(10), 107);
1010+
EXPECT_EQ(estimatedSize.cost(0), 7);
1011+
}

0 commit comments

Comments
 (0)