2222#include " dwio/nimble/common/Exceptions.h"
2323#include " dwio/nimble/common/FixedBitArray.h"
2424#include " dwio/nimble/common/Types.h"
25+ #include " dwio/nimble/encodings/Statistics.h"
2526
2627namespace facebook ::nimble {
2728namespace detail {
2829
// Estimated encoded size of a stream, split into two parts so that callers can
// weigh them differently. Kept as a plain aggregate: the estimators in this
// file build it with brace initialization as {headerSize, dataSize}.
struct SizeEstimation {
  // Bytes attributed to encoding overhead/headers (plus any fixed-size
  // values the estimators fold into the header part).
  uint64_t headerSize;
  // Bytes attributed to the encoded payload.
  uint64_t dataSize;

  // Returns headerSize + dataSize * readFactor, i.e. only the data part is
  // scaled by |readFactor|. The sum is computed in double precision and then
  // truncated toward zero.
  // NOTE(review): presumably readFactor is expected to be non-negative;
  // converting a negative result to uint64_t is undefined - confirm callers.
  uint64_t cost(double readFactor) const {
    const double weighted =
        static_cast<double>(headerSize) +
        static_cast<double>(dataSize) * readFactor;
    // Make the floating point to integer truncation explicit instead of
    // relying on an implicit narrowing conversion in the return statement.
    return static_cast<uint64_t>(weighted);
  }

  // Returns the unweighted total: headerSize + dataSize.
  uint64_t size() const {
    return headerSize + dataSize;
  }
};
42+
// This class is meant to quickly estimate the size of encoded data using a
// given encoding type. It makes a lot of assumptions, and it is not meant to be
// 100% accurate.
3246template <typename T, bool FixedByteWidth>
3347struct EncodingSizeEstimation {
3448 using physicalType = typename TypeTraits<T>::physicalType;
3549
36- static std::optional<uint64_t > estimateNumericSize (
50+ static std::optional<SizeEstimation > estimateNumericSize (
3751 const EncodingType encodingType,
3852 const uint64_t entryCount,
3953 const Statistics<physicalType>& statistics) {
4054 switch (encodingType) {
4155 case EncodingType::Constant: {
4256 return statistics.uniqueCounts ().size () == 1
43- ? std::optional<uint64_t >{getEncodingOverhead<
44- EncodingType::Constant,
45- physicalType>()}
57+ ? std::optional<
58+ SizeEstimation>{{getEncodingOverhead<EncodingType::Constant, physicalType>(), 0 }}
4659 : std::nullopt ;
4760 }
4861 case EncodingType::MainlyConstant: {
@@ -71,25 +84,26 @@ struct EncodingSizeEstimation {
7184 // stored bit packed.
7285 const auto uncommonIndicesSize =
7386 bitPackedBytes (0 , entryCount, uncommonCount);
74- uint32_t overhead =
87+ const uint32_t overhead =
7588 getEncodingOverhead<EncodingType::MainlyConstant, physicalType>() +
7689 // Overhead for storing uncommon values
7790 getEncodingOverhead<EncodingType::FixedBitWidth, physicalType>() +
7891 // Overhead for storing uncommon bitmap
7992 getEncodingOverhead<EncodingType::SparseBool, bool >() +
8093 getEncodingOverhead<EncodingType::FixedBitWidth, uint32_t >();
81- return overhead + sizeof (physicalType) + uncommonValueSize +
82- uncommonIndicesSize;
94+ return {
95+ {overhead + sizeof (physicalType),
96+ uncommonValueSize + uncommonIndicesSize}};
8397 }
8498 case EncodingType::Trivial: {
85- return getEncodingOverhead<EncodingType::Trivial, physicalType>() +
86- (entryCount * sizeof (physicalType));
99+ return {
100+ {getEncodingOverhead<EncodingType::Trivial, physicalType>(),
101+ entryCount * sizeof (physicalType)}};
87102 }
88103 case EncodingType::FixedBitWidth: {
89- return getEncodingOverhead<
90- EncodingType::FixedBitWidth,
91- physicalType>() +
92- bitPackedBytes (statistics.min (), statistics.max (), entryCount);
104+ return {
105+ {getEncodingOverhead<EncodingType::FixedBitWidth, physicalType>(),
106+ bitPackedBytes (statistics.min (), statistics.max (), entryCount)}};
93107 }
94108 case EncodingType::Dictionary: {
95109 // Assumptions:
@@ -100,13 +114,13 @@ struct EncodingSizeEstimation {
100114 bitPackedBytes (0 , statistics.uniqueCounts ().size (), entryCount);
101115 const uint64_t alphabetSize =
102116 statistics.uniqueCounts ().size () * sizeof (physicalType);
103- uint32_t overhead =
117+ const uint32_t overhead =
104118 getEncodingOverhead<EncodingType::Dictionary, physicalType>() +
105119 // Alphabet overhead
106120 getEncodingOverhead<EncodingType::Trivial, physicalType>() +
107121 // Indices overhead
108122 getEncodingOverhead<EncodingType::FixedBitWidth, uint32_t >();
109- return overhead + alphabetSize + indicesSize;
123+ return {{ overhead, alphabetSize + indicesSize}} ;
110124 }
111125 case EncodingType::RLE: {
112126 // Assumptions:
@@ -122,13 +136,13 @@ struct EncodingSizeEstimation {
122136 statistics.minRepeat (),
123137 statistics.maxRepeat (),
124138 statistics.consecutiveRepeatCount ());
125- uint32_t overhead =
139+ const uint32_t overhead =
126140 getEncodingOverhead<EncodingType::RLE, physicalType>() +
127141 // Overhead of run values
128142 getEncodingOverhead<EncodingType::FixedBitWidth, physicalType>() +
129143 // Overhead of run lengths
130144 getEncodingOverhead<EncodingType::FixedBitWidth, uint32_t >();
131- return overhead + runValuesSize + runLengthsSize;
145+ return {{ overhead, runValuesSize + runLengthsSize}} ;
132146 }
133147 case EncodingType::Varint: {
134148 // Note: the condition below actually support floating point numbers as
@@ -145,8 +159,9 @@ struct EncodingSizeEstimation {
145159 [&i](const uint64_t sum, const uint64_t bucketSize) {
146160 return sum + (bucketSize * (++i));
147161 });
148- return getEncodingOverhead<EncodingType::Varint, physicalType>() +
149- dataSize;
162+ return {
163+ {getEncodingOverhead<EncodingType::Varint, physicalType>(),
164+ dataSize}};
150165 } else {
151166 return std::nullopt ;
152167 }
@@ -157,16 +172,15 @@ struct EncodingSizeEstimation {
157172 }
158173 }
159174
160- static std::optional<uint64_t > estimateBoolSize (
175+ static std::optional<SizeEstimation > estimateBoolSize (
161176 const EncodingType encodingType,
162177 const size_t entryCount,
163178 const Statistics<physicalType>& statistics) {
164179 switch (encodingType) {
165180 case EncodingType::Constant: {
166181 return statistics.uniqueCounts ().size () == 1
167- ? std::optional<uint64_t >{getEncodingOverhead<
168- EncodingType::Constant,
169- physicalType>()}
182+ ? std::optional<
183+ SizeEstimation>{{getEncodingOverhead<EncodingType::Constant, physicalType>(), 0 }}
170184 : std::nullopt ;
171185 }
172186 case EncodingType::SparseBool: {
@@ -177,16 +191,18 @@ struct EncodingSizeEstimation {
177191 const auto exceptionCount = std::min (
178192 statistics.uniqueCounts ().at (true ),
179193 statistics.uniqueCounts ().at (false ));
180- uint32_t overhead =
194+ const uint32_t overhead =
181195 getEncodingOverhead<EncodingType::SparseBool, physicalType>() +
182196 // Overhead for storing exception indices
183197 getEncodingOverhead<EncodingType::FixedBitWidth, uint32_t >();
184- return overhead + sizeof (bool ) +
185- bitPackedBytes (0 , entryCount, exceptionCount);
198+ return {
199+ {overhead + sizeof (bool ),
200+ bitPackedBytes (0 , entryCount, exceptionCount)}};
186201 }
187202 case EncodingType::Trivial: {
188- return getEncodingOverhead<EncodingType::Trivial, physicalType>() +
189- FixedBitArray::bufferSize (entryCount, 1 );
203+ return {
204+ {getEncodingOverhead<EncodingType::Trivial, physicalType>(),
205+ FixedBitArray::bufferSize (entryCount, 1 )}};
190206 }
191207 case EncodingType::RLE: {
192208 // Assumptions:
@@ -197,29 +213,28 @@ struct EncodingSizeEstimation {
197213 statistics.minRepeat (),
198214 statistics.maxRepeat (),
199215 statistics.consecutiveRepeatCount ());
200- uint32_t overhead =
216+ const uint32_t overhead =
201217 getEncodingOverhead<EncodingType::RLE, physicalType>() +
202218 // Overhead of run lengths
203219 getEncodingOverhead<EncodingType::FixedBitWidth, uint32_t >();
204- return overhead + sizeof (bool ) + runLengthsSize;
220+ return {{ overhead + sizeof (bool ), runLengthsSize}} ;
205221 }
206222 default : {
207223 return std::nullopt ;
208224 }
209225 }
210226 }
211227
212- static std::optional<uint64_t > estimateStringSize (
228+ static std::optional<SizeEstimation > estimateStringSize (
213229 const EncodingType encodingType,
214230 const size_t entryCount,
215231 const Statistics<std::string_view>& statistics) {
216232 const uint32_t maxStringSize = statistics.max ().size ();
217233 switch (encodingType) {
218234 case EncodingType::Constant: {
219235 return statistics.uniqueCounts ().size () == 1
220- ? std::optional<uint64_t >{getEncodingOverhead<
221- EncodingType::Constant,
222- physicalType>(maxStringSize)}
236+ ? std::optional<
237+ SizeEstimation>{{getEncodingOverhead<EncodingType::Constant, physicalType>(maxStringSize), 0 }}
223238 : std::nullopt ;
224239 }
225240 case EncodingType::MainlyConstant: {
@@ -264,7 +279,7 @@ struct EncodingSizeEstimation {
264279 // stored bit packed.
265280 const auto uncommonIndicesSize =
266281 bitPackedBytes (0 , entryCount, uncommonCount);
267- uint32_t overhead =
282+ const uint32_t overhead =
268283 getEncodingOverhead<EncodingType::MainlyConstant, physicalType>(
269284 maxUniqueCount->first .size ()) +
270285 // Overhead for storing uncommon values
@@ -273,17 +288,18 @@ struct EncodingSizeEstimation {
273288 // Overhead for storing uncommon bitmap
274289 getEncodingOverhead<EncodingType::SparseBool, bool >();
275290
276- return overhead + alphabetSize + uncommonIndicesSize;
291+ return {{ overhead, alphabetSize + uncommonIndicesSize}} ;
277292 }
278293 case EncodingType::Trivial: {
279294 // We assume string lengths will be stored bit packed.
280- return getEncodingOverhead<EncodingType::Trivial, physicalType>(
281- maxStringSize) +
282- statistics.totalStringsLength () +
283- bitPackedBytes (
284- statistics.min ().size (),
285- statistics.max ().size (),
286- entryCount);
295+ return {
296+ {getEncodingOverhead<EncodingType::Trivial, physicalType>(
297+ maxStringSize),
298+ statistics.totalStringsLength () +
299+ bitPackedBytes (
300+ statistics.min ().size (),
301+ statistics.max ().size (),
302+ entryCount)}};
287303 }
288304 case EncodingType::Dictionary: {
289305 // Assumptions:
@@ -305,23 +321,23 @@ struct EncodingSizeEstimation {
305321 bitPackedBytes (statistics.min ().size (),
306322 statistics.max ().size (),
307323 statistics.uniqueCounts ().size ());
308- uint32_t overhead =
324+ const uint32_t overhead =
309325 getEncodingOverhead<EncodingType::Dictionary, physicalType>(
310326 maxStringSize) +
311327 // Alphabet overhead
312328 getEncodingOverhead<EncodingType::Trivial, physicalType>(
313329 maxStringSize) +
314330 // Indices overhead
315331 getEncodingOverhead<EncodingType::FixedBitWidth, uint32_t >();
316- return overhead + alphabetSize + indicesSize;
332+ return {{ overhead, alphabetSize + indicesSize}} ;
317333 }
318334 case EncodingType::RLE: {
319335 // Assumptions:
320336 // Run values are stored using dictionary (and inside, trivial +
321337 // bit-packing). Run lengths are stored using bit-packing (with bit
322338 // width needed to store max repetition count).
323339
324- uint64_t runValuesSize =
340+ const uint64_t runValuesSize =
325341 // (unique) strings blob size
326342 std::accumulate (
327343 statistics.uniqueCounts ().cbegin (),
@@ -344,23 +360,23 @@ struct EncodingSizeEstimation {
344360 statistics.minRepeat (),
345361 statistics.maxRepeat (),
346362 statistics.consecutiveRepeatCount ());
347- uint32_t overhead =
363+ const uint32_t overhead =
348364 getEncodingOverhead<EncodingType::RLE, physicalType>() +
349365 // Overhead of run values
350366 getEncodingOverhead<EncodingType::Dictionary, physicalType>() +
351367 getEncodingOverhead<EncodingType::Trivial, physicalType>() +
352368 getEncodingOverhead<EncodingType::FixedBitWidth, uint32_t >() +
353369 // Overhead of run lengths
354370 getEncodingOverhead<EncodingType::FixedBitWidth, uint32_t >();
355- return overhead + runValuesSize + runLengthsSize;
371+ return {{ overhead, runValuesSize + runLengthsSize}} ;
356372 }
357373 default : {
358374 return std::nullopt ;
359375 }
360376 }
361377 }
362378
363- static std::optional<uint64_t > estimateSize (
379+ static std::optional<SizeEstimation > estimateSize (
364380 const EncodingType encodingType,
365381 const size_t entryCount,
366382 const Statistics<physicalType>& statistics) {
0 commit comments