Skip to content

Commit f67af15

Browse files
authored
chore: minor fixes around huffman encoding (#6037)
Instead of limiting each count in the histogram, we limit and normalize by the histogram sum. Signed-off-by: Roman Gershman <[email protected]>
1 parent edf431f commit f67af15

File tree

2 files changed

+12
-12
lines changed

2 files changed

+12
-12
lines changed

src/core/dfly_core_test.cc

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -405,11 +405,14 @@ TEST_F(HuffCoderTest, HugeHistogram) {
405405
// The bug is in the following code in huf_compress.c:
406406
// huffNode0[0].count = (U32)(1U<<31); /* fake entry, strong barrier */
407407
// where it uses the count as a sentinel assuming that no other counts can be larger than 2^31.
408-
// this may not be true for histograms with huge counts, so we need to make sure that
409-
// all counts are smaller than 2^31 / 256.
408+
// this may not be true for histograms with huge counts, so we need to make sure that sum of all
409+
// counts is smaller than 2^31.
410+
uint64_t sum = 0;
410411
for (unsigned i = 0; i < hist.size(); ++i) {
411-
hist[i] >>= 2; // Without this the algorithm causes a data race and crash.
412+
sum += hist[i];
413+
hist[i] /= 4; // Without this the algorithm causes a data race and crash.
412414
}
415+
LOG(INFO) << "Total sum: " << sum << " reduced sum: " << sum / 4;
413416
ASSERT_TRUE(encoder_.Build(hist.data(), hist.size() - 1, &error_msg_)) << error_msg_;
414417

415418
string bindata = encoder_.Export();

src/server/debugcmd.cc

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -283,8 +283,8 @@ unsigned HufHist::MaxFreqCount() const {
283283
return max_freq;
284284
}
285285

286-
unsigned kMaxFreqPerShard = 1U << 19;
287-
unsigned kMaxFreqTotal = 1U << 23;
286+
unsigned kMaxFreqPerShard = 1U << 20;
287+
unsigned kMaxFreqTotal = static_cast<unsigned>((1U << 31) * 0.9);
288288

289289
void DoComputeHist(CompactObjType type, EngineShard* shard, ConnectionContext* cntx,
290290
HufHist* dest) {
@@ -1403,23 +1403,20 @@ void DebugCmd::Values(CmdArgList args, facade::SinkReplyBuilder* builder) {
14031403
}
14041404

14051405
static size_t PostProcessHist(HufHist* dest) {
1406-
size_t raw_size = 0;
1406+
size_t total_freq = 0;
14071407
auto& hist = dest->hist;
14081408
unsigned max_freq = 0;
14091409

14101410
for (unsigned i = 0; i <= HufHist::kMaxSymbol; i++) {
14111411
// raw_size may count less characters than the actual size because
14121412
// we may cut the counting early.
1413-
raw_size += hist[i];
1414-
if (hist[i] > max_freq) {
1415-
max_freq = hist[i];
1416-
}
1413+
total_freq += hist[i];
14171414
if (hist[i] == 0) {
14181415
hist[i] = 1; // Avoid zero frequency symbols.
14191416
}
14201417
}
14211418

1422-
if (max_freq > kMaxFreqTotal) {
1419+
if (total_freq > kMaxFreqTotal) {
14231420
// huffman encoder has a bug with frequencies too high, so we scale down everything
14241421
// to avoid overflow.
14251422
double scale = static_cast<double>(max_freq) / kMaxFreqTotal;
@@ -1430,7 +1427,7 @@ static size_t PostProcessHist(HufHist* dest) {
14301427
}
14311428
}
14321429
}
1433-
return raw_size;
1430+
return total_freq;
14341431
}
14351432

14361433
void DebugCmd::Compression(CmdArgList args, facade::SinkReplyBuilder* builder) {

0 commit comments

Comments
 (0)