Skip to content

Commit 58cd140

Browse files
sdruzkin authored and meta-codesync[bot] committed
Prepare stats for optional Uniques (facebookincubator#293)
Summary: Pull Request resolved: facebookincubator#293. Prepare stats for optional uniques by wrapping uniqueCounts_ in an additional std::optional. Consumers do not yet verify that the uniques value is present; they call .value() directly, on the assumption that for now it is always populated. [Strobelight](https://fburl.com/scuba/strobelight_services/yoali6ap) shows that 93% of estimateSize is spent in estimateNumericSize, which is where we will add a threshold next, after collecting a bit of data. Reviewed By: HuamengJiang. Differential Revision: D85310781. fbshipit-source-id: d221315021cd819ed84365f594985064e4e9c590
1 parent f955818 commit 58cd140

File tree

9 files changed

+71
-66
lines changed

9 files changed

+71
-66
lines changed

dwio/nimble/encodings/ConstantEncoding.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ std::string_view ConstantEncoding<T>::encode(
124124
NIMBLE_INCOMPATIBLE_ENCODING("ConstantEncoding cannot be empty.");
125125
}
126126

127-
if (selection.statistics().uniqueCounts().size() != 1) {
127+
if (selection.statistics().uniqueCounts().value().size() != 1) {
128128
NIMBLE_INCOMPATIBLE_ENCODING("ConstantEncoding requires constant data.");
129129
}
130130

dwio/nimble/encodings/DictionaryEncoding.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,8 @@ std::string_view DictionaryEncoding<T>::encode(
200200
std::span<const physicalType> values,
201201
Buffer& buffer) {
202202
const uint32_t valueCount = values.size();
203-
const uint32_t alphabetCount = selection.statistics().uniqueCounts().size();
203+
const uint32_t alphabetCount =
204+
selection.statistics().uniqueCounts().value().size();
204205

205206
folly::F14FastMap<physicalType, uint32_t> alphabetMapping;
206207
alphabetMapping.reserve(alphabetCount);

dwio/nimble/encodings/EncodingSelectionPolicy.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -323,7 +323,7 @@ struct EncodingPredictionModel {
323323
// TODO: Utilize more features within statistics for prediction.
324324
auto maxRepeat = statistics.maxRepeat();
325325
auto minRepeat = statistics.minRepeat();
326-
auto unique = statistics.uniqueCounts().size();
326+
auto unique = statistics.uniqueCounts().value().size();
327327
return maxRepeatParam * maxRepeat + minRepeatParam * minRepeat +
328328
uniqueParam * unique;
329329
}

dwio/nimble/encodings/EncodingSizeEstimation.h

Lines changed: 23 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ struct EncodingSizeEstimation {
3939
const Statistics<physicalType>& statistics) {
4040
switch (encodingType) {
4141
case EncodingType::Constant: {
42-
return statistics.uniqueCounts().size() == 1
42+
return statistics.uniqueCounts().value().size() == 1
4343
? std::optional<uint64_t>{getEncodingOverhead<
4444
EncodingType::Constant,
4545
physicalType>()}
@@ -59,8 +59,8 @@ struct EncodingSizeEstimation {
5959

6060
// Find most common item count
6161
const auto maxUniqueCount = std::max_element(
62-
statistics.uniqueCounts().cbegin(),
63-
statistics.uniqueCounts().cend(),
62+
statistics.uniqueCounts().value().cbegin(),
63+
statistics.uniqueCounts().value().cend(),
6464
[](const auto& a, const auto& b) { return a.second < b.second; });
6565
// Deduce uncommon values count
6666
const auto uncommonCount = entryCount - maxUniqueCount->second;
@@ -96,10 +96,10 @@ struct EncodingSizeEstimation {
9696
// Alphabet stored trivially.
9797
// Indices are stored bit-packed, with bit width needed to store max
9898
// dictionary size (which is the unique value count).
99-
const uint64_t indicesSize =
100-
bitPackedBytes(0, statistics.uniqueCounts().size(), entryCount);
99+
const uint64_t indicesSize = bitPackedBytes(
100+
0, statistics.uniqueCounts().value().size(), entryCount);
101101
const uint64_t alphabetSize =
102-
statistics.uniqueCounts().size() * sizeof(physicalType);
102+
statistics.uniqueCounts().value().size() * sizeof(physicalType);
103103
uint32_t overhead =
104104
getEncodingOverhead<EncodingType::Dictionary, physicalType>() +
105105
// Alphabet overhead
@@ -163,7 +163,7 @@ struct EncodingSizeEstimation {
163163
const Statistics<physicalType>& statistics) {
164164
switch (encodingType) {
165165
case EncodingType::Constant: {
166-
return statistics.uniqueCounts().size() == 1
166+
return statistics.uniqueCounts().value().size() == 1
167167
? std::optional<uint64_t>{getEncodingOverhead<
168168
EncodingType::Constant,
169169
physicalType>()}
@@ -175,8 +175,8 @@ struct EncodingSizeEstimation {
175175
// representing max entry count).
176176

177177
const auto exceptionCount = std::min(
178-
statistics.uniqueCounts().at(true),
179-
statistics.uniqueCounts().at(false));
178+
statistics.uniqueCounts().value().at(true),
179+
statistics.uniqueCounts().value().at(false));
180180
uint32_t overhead =
181181
getEncodingOverhead<EncodingType::SparseBool, physicalType>() +
182182
// Overhead for storing exception indices
@@ -216,7 +216,7 @@ struct EncodingSizeEstimation {
216216
const uint32_t maxStringSize = statistics.max().size();
217217
switch (encodingType) {
218218
case EncodingType::Constant: {
219-
return statistics.uniqueCounts().size() == 1
219+
return statistics.uniqueCounts().value().size() == 1
220220
? std::optional<uint64_t>{getEncodingOverhead<
221221
EncodingType::Constant,
222222
physicalType>(maxStringSize)}
@@ -238,13 +238,13 @@ struct EncodingSizeEstimation {
238238

239239
// Find the most common item count
240240
const auto maxUniqueCount = std::max_element(
241-
statistics.uniqueCounts().cbegin(),
242-
statistics.uniqueCounts().cend(),
241+
statistics.uniqueCounts().value().cbegin(),
242+
statistics.uniqueCounts().value().cend(),
243243
[](const auto& a, const auto& b) { return a.second < b.second; });
244244
// Get the total blob size for all (unique) strings
245245
const uint64_t alphabetByteSize = std::accumulate(
246-
statistics.uniqueCounts().cbegin(),
247-
statistics.uniqueCounts().cend(),
246+
statistics.uniqueCounts().value().cbegin(),
247+
statistics.uniqueCounts().value().cend(),
248248
0,
249249
[](const uint32_t sum, const auto& unique) {
250250
return sum + unique.first.size();
@@ -259,7 +259,7 @@ struct EncodingSizeEstimation {
259259
maxUniqueCount->first.size() +
260260
bitPackedBytes(statistics.min().size(),
261261
statistics.max().size(),
262-
statistics.uniqueCounts().size());
262+
statistics.uniqueCounts().value().size());
263263
// Uncommon values (sparse bool) bitmap will have index per value,
264264
// stored bit packed.
265265
const auto uncommonIndicesSize =
@@ -290,12 +290,12 @@ struct EncodingSizeEstimation {
290290
// Alphabet stored trivially.
291291
// Indices are stored bit-packed, with bit width needed to store max
292292
// dictionary size (which is the unique value count).
293-
const uint64_t indicesSize =
294-
bitPackedBytes(0, statistics.uniqueCounts().size(), entryCount);
293+
const uint64_t indicesSize = bitPackedBytes(
294+
0, statistics.uniqueCounts().value().size(), entryCount);
295295
// Get the total blob size for all (unique) strings
296296
const uint64_t alphabetByteSize = std::accumulate(
297-
statistics.uniqueCounts().cbegin(),
298-
statistics.uniqueCounts().cend(),
297+
statistics.uniqueCounts().value().cbegin(),
298+
statistics.uniqueCounts().value().cend(),
299299
0,
300300
[](const uint32_t sum, const auto& unique) {
301301
return sum + unique.first.size();
@@ -304,7 +304,7 @@ struct EncodingSizeEstimation {
304304
const uint64_t alphabetSize = alphabetByteSize +
305305
bitPackedBytes(statistics.min().size(),
306306
statistics.max().size(),
307-
statistics.uniqueCounts().size());
307+
statistics.uniqueCounts().value().size());
308308
uint32_t overhead =
309309
getEncodingOverhead<EncodingType::Dictionary, physicalType>(
310310
maxStringSize) +
@@ -324,8 +324,8 @@ struct EncodingSizeEstimation {
324324
uint64_t runValuesSize =
325325
// (unique) strings blob size
326326
std::accumulate(
327-
statistics.uniqueCounts().cbegin(),
328-
statistics.uniqueCounts().cend(),
327+
statistics.uniqueCounts().value().cbegin(),
328+
statistics.uniqueCounts().value().cend(),
329329
0,
330330
[](const uint32_t sum, const auto& unique) {
331331
return sum + unique.first.size();
@@ -338,7 +338,7 @@ struct EncodingSizeEstimation {
338338
// dictionary indices
339339
bitPackedBytes(
340340
0,
341-
statistics.uniqueCounts().size(),
341+
statistics.uniqueCounts().value().size(),
342342
statistics.consecutiveRepeatCount());
343343
const auto runLengthsSize = bitPackedBytes(
344344
statistics.minRepeat(),

dwio/nimble/encodings/MainlyConstantEncoding.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -345,8 +345,8 @@ std::string_view MainlyConstantEncoding<T>::encode(
345345
}
346346

347347
const auto commonElement = std::max_element(
348-
selection.statistics().uniqueCounts().cbegin(),
349-
selection.statistics().uniqueCounts().cend(),
348+
selection.statistics().uniqueCounts().value().cbegin(),
349+
selection.statistics().uniqueCounts().value().cend(),
350350
[](const auto& a, const auto& b) { return a.second < b.second; });
351351

352352
const uint32_t entryCount = values.size();

dwio/nimble/encodings/SparseBoolEncoding.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -94,10 +94,11 @@ std::string_view SparseBoolEncoding::encode(
9494
std::span<const bool> values,
9595
Buffer& buffer) {
9696
// Decide the polarity of the encoding.
97-
const uint32_t valueCount = values.size();
98-
const uint32_t setCount = selection.statistics().uniqueCounts().at(true);
97+
const uint64_t valueCount = values.size();
98+
const uint64_t setCount =
99+
selection.statistics().uniqueCounts().value().at(true);
99100
bool sparseValue;
100-
uint32_t indexCount;
101+
uint64_t indexCount;
101102
if (setCount > (valueCount >> 1)) {
102103
sparseValue = false;
103104
indexCount = valueCount - setCount;

dwio/nimble/encodings/Statistics.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818

1919
#include <algorithm>
2020
#include <limits>
21-
#include <memory>
2221
#include <type_traits>
2322

2423
namespace facebook::nimble {
@@ -128,7 +127,7 @@ void Statistics<T, InputType>::populateUniques() const {
128127
++uniqueCounts[data_[i]];
129128
}
130129
}
131-
uniqueCounts_.emplace(std::move(uniqueCounts));
130+
uniqueCounts_.emplace(std::make_optional(std::move(uniqueCounts)));
132131
}
133132

134133
template <typename T, typename InputType>
@@ -192,7 +191,8 @@ Statistics<T, InputType> Statistics<T, InputType>::create(
192191
statistics.max_ = T();
193192

194193
statistics.bucketCounts_ = {};
195-
statistics.uniqueCounts_ = {};
194+
statistics.uniqueCounts_ = std::make_optional(
195+
std::make_optional(UniqueValueCounts<T, InputType>()));
196196
return statistics;
197197
}
198198

dwio/nimble/encodings/Statistics.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
#pragma once
1717

1818
#include <limits>
19-
#include <memory>
2019
#include <optional>
2120
#include <span>
2221
#include <type_traits>
@@ -179,7 +178,8 @@ class Statistics {
179178
return bucketCounts_.value();
180179
}
181180

182-
const UniqueValueCounts<T, InputType>& uniqueCounts() const noexcept {
181+
const std::optional<UniqueValueCounts<T, InputType>>& uniqueCounts()
182+
const noexcept {
183183
if (!uniqueCounts_.has_value()) {
184184
populateUniques();
185185
}
@@ -204,7 +204,8 @@ class Statistics {
204204
mutable std::optional<T> min_;
205205
mutable std::optional<T> max_;
206206
mutable std::optional<std::vector<uint64_t>> bucketCounts_;
207-
mutable std::optional<UniqueValueCounts<T, InputType>> uniqueCounts_;
207+
mutable std::optional<std::optional<UniqueValueCounts<T, InputType>>>
208+
uniqueCounts_;
208209
};
209210

210211
} // namespace facebook::nimble

0 commit comments

Comments
 (0)