Skip to content

Commit 7e3cce9

Browse files
authored
chore: support huffman encoding for string values (#5815)
* chore: support huffman encoding for string values 1. Differentiate huffman encoding for strings and values. 2. Fix "debug compression" to set huffman table for both domains. --------- Signed-off-by: Roman Gershman <[email protected]>
1 parent 47445e6 commit 7e3cce9

File tree

5 files changed

+100
-58
lines changed

5 files changed

+100
-58
lines changed

src/core/compact_object.cc

Lines changed: 52 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,10 @@ struct TL {
366366
size_t small_str_bytes;
367367
Huffman huff_keys, huff_string_values;
368368
uint64_t huff_encode_total = 0, huff_encode_success = 0; // success/total metrics.
369+
370+
const HuffmanDecoder& GetHuffmanDecoder(uint8_t huffman_domain) const {
371+
return huffman_domain == CompactObj::HUFF_KEYS ? huff_keys.decoder : huff_string_values.decoder;
372+
}
369373
};
370374

371375
thread_local TL tl;
@@ -606,10 +610,9 @@ int RobjWrapper::ZsetAdd(double score, std::string_view ele, int in_flags, int*
606610
bool gt = (in_flags & ZADD_IN_GT) != 0;
607611
bool lt = (in_flags & ZADD_IN_LT) != 0;
608612

609-
unsigned char* eptr;
610613
uint8_t* lp = (uint8_t*)inner_obj_;
611-
612-
if ((eptr = ZzlFind(lp, ele, &curscore)) != NULL) {
614+
uint8_t* eptr = ZzlFind(lp, ele, &curscore);
615+
if (eptr != NULL) {
613616
/* NX? Return, same element already exists. */
614617
if (nx) {
615618
*out_flags |= ZADD_OUT_NOP;
@@ -774,8 +777,10 @@ CompactObj& CompactObj::operator=(CompactObj&& o) noexcept {
774777
SetMeta(o.taglen_, o.mask_); // Frees underlying resources if needed.
775778
memcpy(&u_, &o.u_, sizeof(u_));
776779

780+
tagbyte_ = o.tagbyte_;
781+
777782
// SetMeta deallocates the object and we only want reset it.
778-
o.taglen_ = 0;
783+
o.tagbyte_ = 0;
779784
o.mask_ = 0;
780785

781786
return *this;
@@ -1012,7 +1017,7 @@ void CompactObj::SetString(std::string_view str, bool is_key) {
10121017
}
10131018
}
10141019

1015-
EncodeString(str);
1020+
EncodeString(str, is_key);
10161021
}
10171022

10181023
void CompactObj::ReserveString(size_t size) {
@@ -1141,7 +1146,8 @@ void CompactObj::GetString(char* dest) const {
11411146
next += slices[0].size() - 1;
11421147
memcpy(next, slices[1].data(), slices[1].size());
11431148
string_view src(reinterpret_cast<const char*>(tl.tmp_buf.data()), tl.tmp_buf.size());
1144-
CHECK(tl.huff_keys.decoder.Decode(src, decoded_len, dest));
1149+
const auto& decoder = tl.GetHuffmanDecoder(huffman_domain_);
1150+
CHECK(decoder.Decode(src, decoded_len, dest));
11451151
return;
11461152
}
11471153

@@ -1237,15 +1243,15 @@ void CompactObj::Materialize(std::string_view blob, bool is_raw) {
12371243
u_.r_obj.SetString(blob, tl.local_mr);
12381244
}
12391245
} else {
1240-
EncodeString(blob);
1246+
EncodeString(blob, false);
12411247
}
12421248
}
12431249

12441250
void CompactObj::Reset() {
12451251
if (HasAllocated()) {
12461252
Free();
12471253
}
1248-
taglen_ = 0;
1254+
tagbyte_ = 0;
12491255
mask_ = 0;
12501256
}
12511257

@@ -1355,7 +1361,8 @@ bool CompactObj::CmpEncoded(string_view sv) const {
13551361
constexpr size_t kMaxHuffLen = kInlineLen * 3;
13561362
if (sz <= kMaxHuffLen) {
13571363
char buf[kMaxHuffLen];
1358-
CHECK(tl.huff_keys.decoder.Decode({u_.inline_str + 1, size_t(taglen_ - 1)}, sz, buf));
1364+
const auto& decoder = tl.GetHuffmanDecoder(huffman_domain_);
1365+
CHECK(decoder.Decode({u_.inline_str + 1, size_t(taglen_ - 1)}, sz, buf));
13591366
return sv == string_view(buf, sz);
13601367
}
13611368
}
@@ -1437,7 +1444,7 @@ bool CompactObj::CmpEncoded(string_view sv) const {
14371444
return false;
14381445
}
14391446

1440-
void CompactObj::EncodeString(string_view str) {
1447+
void CompactObj::EncodeString(string_view str, bool is_key) {
14411448
DCHECK_GT(str.size(), kInlineLen);
14421449
DCHECK_EQ(NONE_ENC, mask_bits_.encoding);
14431450

@@ -1447,6 +1454,7 @@ void CompactObj::EncodeString(string_view str) {
14471454
// We chose such length that we can store the decoded length delta into 1 byte.
14481455
// The maximum huffman compression is 1/8, so 288 / 8 = 36.
14491456
// 288 - 36 = 252, which is smaller than 256.
1457+
// TODO: introduce variable length huffman length.
14501458
constexpr unsigned kMaxHuffLen = 288;
14511459

14521460
// For sizes 17, 18 we would like to test ascii encoding first as it's more efficient.
@@ -1455,34 +1463,38 @@ void CompactObj::EncodeString(string_view str) {
14551463
kUseAsciiEncoding && str.size() < 19 && detail::validate_ascii_fast(str.data(), str.size());
14561464

14571465
// if !is_ascii, we try huffman encoding next.
1458-
if (!is_ascii && str.size() <= kMaxHuffLen && tl.huff_keys.encoder.valid()) {
1459-
unsigned dest_len = tl.huff_keys.encoder.CompressedBound(str.size());
1460-
// 1 byte for storing the size delta.
1461-
tl.tmp_buf.resize(1 + dest_len);
1462-
string err_msg;
1463-
++tl.huff_encode_total;
1464-
bool res = tl.huff_keys.encoder.Encode(str, tl.tmp_buf.data() + 1, &dest_len, &err_msg);
1465-
if (res) {
1466-
// we accept huffman encoding only if it is:
1467-
// 1. smaller than the original string by 20%
1468-
// 2. allows us to store the encoded string in the inline buffer
1469-
if (dest_len && (dest_len < kInlineLen || (dest_len + dest_len / 5) < str.size())) {
1470-
huff_encoded = true;
1471-
tl.huff_encode_success++;
1472-
encoded = string_view{reinterpret_cast<char*>(tl.tmp_buf.data()), dest_len + 1};
1473-
unsigned delta = str.size() - dest_len;
1474-
DCHECK_LT(delta, 256u);
1475-
tl.tmp_buf[0] = static_cast<uint8_t>(delta);
1476-
mask_bits_.encoding = HUFFMAN_ENC;
1477-
if (encoded.size() <= kInlineLen) {
1478-
SetMeta(encoded.size(), mask_);
1479-
memcpy(u_.inline_str, tl.tmp_buf.data(), encoded.size());
1480-
return;
1466+
if (!is_ascii && str.size() <= kMaxHuffLen) {
1467+
auto& huffman = is_key ? tl.huff_keys : tl.huff_string_values;
1468+
if (huffman.encoder.valid()) {
1469+
unsigned dest_len = huffman.encoder.CompressedBound(str.size());
1470+
// 1 byte for storing the size delta.
1471+
tl.tmp_buf.resize(1 + dest_len);
1472+
string err_msg;
1473+
++tl.huff_encode_total;
1474+
bool res = huffman.encoder.Encode(str, tl.tmp_buf.data() + 1, &dest_len, &err_msg);
1475+
if (res) {
1476+
// we accept huffman encoding only if it is:
1477+
// 1. smaller than the original string by 20%
1478+
// 2. allows us to store the encoded string in the inline buffer
1479+
if (dest_len && (dest_len < kInlineLen || (dest_len + dest_len / 5) < str.size())) {
1480+
huff_encoded = true;
1481+
tl.huff_encode_success++;
1482+
encoded = string_view{reinterpret_cast<char*>(tl.tmp_buf.data()), dest_len + 1};
1483+
unsigned delta = str.size() - dest_len;
1484+
DCHECK_LT(delta, 256u);
1485+
tl.tmp_buf[0] = static_cast<uint8_t>(delta);
1486+
mask_bits_.encoding = HUFFMAN_ENC;
1487+
huffman_domain_ = is_key ? HUFF_KEYS : HUFF_STRING_VALUES;
1488+
if (encoded.size() <= kInlineLen) {
1489+
SetMeta(encoded.size(), mask_);
1490+
memcpy(u_.inline_str, tl.tmp_buf.data(), encoded.size());
1491+
return;
1492+
}
14811493
}
1494+
} else {
1495+
// Should not happen, means we have an internal bug.
1496+
LOG(DFATAL) << "Failed to encode string with huffman: " << err_msg;
14821497
}
1483-
} else {
1484-
// Should not happen, means we have an internal bug.
1485-
LOG(DFATAL) << "Failed to encode string with huffman: " << err_msg;
14861498
}
14871499
}
14881500

@@ -1609,9 +1621,11 @@ size_t CompactObj::StrEncoding::Decode(std::string_view blob, char* dest) const
16091621
case ASCII2_ENC:
16101622
detail::ascii_unpack(reinterpret_cast<const uint8_t*>(blob.data()), decoded_len, dest);
16111623
break;
1612-
case HUFFMAN_ENC:
1613-
tl.huff_keys.decoder.Decode(blob.substr(1), decoded_len, dest);
1624+
case HUFFMAN_ENC: {
1625+
const auto& decoder = tl.GetHuffmanDecoder(is_key_);
1626+
decoder.Decode(blob.substr(1), decoded_len, dest);
16141627
break;
1628+
}
16151629
};
16161630
return decoded_len;
16171631
}

src/core/compact_object.h

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ class CompactObj {
135135
NONE_ENC = 0,
136136
ASCII1_ENC = 1,
137137
ASCII2_ENC = 2,
138-
HUFFMAN_ENC = 3, // TBD
138+
HUFFMAN_ENC = 3,
139139
};
140140

141141
public:
@@ -147,12 +147,14 @@ class CompactObj {
147147

148148
private:
149149
friend class CompactObj;
150-
explicit StrEncoding(uint8_t enc) : enc_(static_cast<EncodingEnum>(enc)) {
150+
explicit StrEncoding(uint8_t enc, bool is_key)
151+
: enc_(static_cast<EncodingEnum>(enc)), is_key_(is_key) {
151152
}
152153

153154
size_t DecodedSize(size_t compr_size, uint8_t first_byte) const;
154155

155156
EncodingEnum enc_;
157+
bool is_key_;
156158
};
157159

158160
using PrefixArray = std::vector<std::string_view>;
@@ -183,7 +185,7 @@ class CompactObj {
183185
CompactObj AsRef() const {
184186
CompactObj res;
185187
memcpy(&res.u_, &u_, sizeof(u_));
186-
res.taglen_ = taglen_;
188+
res.tagbyte_ = tagbyte_;
187189
res.mask_ = mask_;
188190
res.mask_bits_.ref = 1;
189191

@@ -429,7 +431,7 @@ class CompactObj {
429431
StringOrView GetRawString() const;
430432

431433
StrEncoding GetStrEncoding() const {
432-
return StrEncoding{mask_bits_.encoding};
434+
return StrEncoding{mask_bits_.encoding, bool(huffman_domain_)};
433435
}
434436

435437
bool HasAllocated() const;
@@ -441,7 +443,7 @@ class CompactObj {
441443
}
442444

443445
private:
444-
void EncodeString(std::string_view str);
446+
void EncodeString(std::string_view str, bool is_key);
445447

446448
bool EqualNonInline(std::string_view sv) const;
447449

@@ -544,7 +546,14 @@ class CompactObj {
544546
};
545547

546548
// We reserve 5 bits for the tag length; of the remaining 3 bits, 1 stores the huffman domain and 2 are currently reserved.
547-
uint8_t taglen_ = 0;
549+
union {
550+
uint8_t tagbyte_ = 0;
551+
struct {
552+
uint8_t taglen_ : 5;
553+
uint8_t huffman_domain_ : 1; // value from HuffmanDomain enum.
554+
uint8_t reserved : 2;
555+
};
556+
};
548557
};
549558

550559
inline bool CompactObj::operator==(std::string_view sv) const {

src/core/compact_object_test.cc

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -665,23 +665,25 @@ static void BuildEncoderAB(HuffmanEncoder* encoder) {
665665
CHECK(encoder->Build(hist.data(), hist.size() - 1, nullptr));
666666
}
667667

668-
TEST_F(CompactObjectTest, HuffMan) {
668+
TEST_F(CompactObjectTest, Huffman) {
669669
HuffmanEncoder encoder;
670670
BuildEncoderAB(&encoder);
671671
string bindata = encoder.Export();
672-
ASSERT_TRUE(CompactObj::InitHuffmanThreadLocal(CompactObj::HUFF_KEYS, bindata));
673-
for (unsigned i = 30; i < 2048; i += 10) {
674-
string data(i, 'a');
675-
cobj_.SetString(data, true);
676-
bool malloc_used = i >= 60;
677-
ASSERT_EQ(malloc_used, cobj_.MallocUsed() > 0) << i;
678-
ASSERT_EQ(data.size(), cobj_.Size());
679-
ASSERT_EQ(CompactObj::HashCode(data), cobj_.HashCode());
680-
681-
string actual;
682-
cobj_.GetString(&actual);
683-
EXPECT_EQ(data, actual);
684-
EXPECT_EQ(cobj_, data);
672+
for (CompactObj::HuffmanDomain domain : {CompactObj::HUFF_KEYS, CompactObj::HUFF_STRING_VALUES}) {
673+
ASSERT_TRUE(CompactObj::InitHuffmanThreadLocal(domain, bindata));
674+
for (unsigned i = 30; i < 2048; i += 10) {
675+
string data(i, 'a');
676+
cobj_.SetString(data, domain == CompactObj::HUFF_KEYS);
677+
bool malloc_used = i >= 60;
678+
ASSERT_EQ(malloc_used, cobj_.MallocUsed() > 0) << i;
679+
ASSERT_EQ(data.size(), cobj_.Size());
680+
ASSERT_EQ(CompactObj::HashCode(data), cobj_.HashCode());
681+
682+
string actual;
683+
cobj_.GetString(&actual);
684+
EXPECT_EQ(data, actual);
685+
EXPECT_EQ(cobj_, data);
686+
}
685687
}
686688
}
687689

src/server/debugcmd.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1436,6 +1436,7 @@ void DebugCmd::Compression(CmdArgList args, facade::SinkReplyBuilder* builder) {
14361436
if (type != OBJ_STRING) { // Currently only string type is supported.
14371437
return builder->SendError(kSyntaxErr);
14381438
}
1439+
domain = CompactObj::HUFF_STRING_VALUES;
14391440
}
14401441
shard_set->RunBriefInParallel([&](EngineShard* shard) {
14411442
if (!CompactObj::InitHuffmanThreadLocal(domain, raw)) {

src/server/dragonfly_test.cc

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -937,6 +937,22 @@ TEST_F(DflyEngineTest, CommandMetricLabels) {
937937
EXPECT_EQ(metrics.facade_stats.conn_stats.num_conns_other, 0);
938938
}
939939

940+
TEST_F(DflyEngineTest, Huffman) {
941+
// enable compression for keys optimized for letter a.
942+
auto resp = Run({"debug", "compression", "set", "GBDgCpXW/////7/pygS5t9x7792qU1trLQ=="});
943+
EXPECT_EQ(resp, "OK");
944+
945+
// for string values optimized for letter x.
946+
resp = Run({"debug", "compression", "set", "ChD4bAf/D/bPSwY=", "string"});
947+
EXPECT_EQ(resp, "OK");
948+
resp = Run({"debug", "populate", "200000", "aaaaaaaaaaaaaaaaaaaaaaaaaa", "32"});
949+
EXPECT_EQ(resp, "OK");
950+
951+
auto metrics = GetMetrics();
952+
EXPECT_EQ(metrics.events.huff_encode_success, 400000); // each key and value
953+
EXPECT_LT(metrics.heap_used_bytes, 14'000'000); // less than 14MB
954+
}
955+
940956
class DflyCommandAliasTest : public DflyEngineTest {
941957
protected:
942958
DflyCommandAliasTest() {

0 commit comments

Comments
 (0)