Skip to content

Commit acf5f19

Browse files
authored
chore: Refactor CompactObject encoding (#5342)
Refactor CompactObject encoding to use StrEncoding helper object
1 parent dab3e1e commit acf5f19

File tree

5 files changed

+110
-98
lines changed

5 files changed

+110
-98
lines changed

src/core/compact_object.cc

Lines changed: 58 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -786,9 +786,7 @@ size_t CompactObj::Size() const {
786786
switch (taglen_) {
787787
case SMALL_TAG:
788788
raw_size = u_.small_str.size();
789-
if (mask_bits_.encoding == HUFFMAN_ENC) {
790-
return DecodedLen(raw_size, u_.small_str.first_byte());
791-
}
789+
first_byte = u_.small_str.first_byte();
792790
break;
793791
case INT_TAG: {
794792
absl::AlphaNum an(u_.ival);
@@ -801,9 +799,7 @@ size_t CompactObj::Size() const {
801799
break;
802800
case ROBJ_TAG:
803801
raw_size = u_.r_obj.Size();
804-
if (mask_bits_.encoding == HUFFMAN_ENC) {
805-
return DecodedLen(raw_size, *(uint8_t*)u_.r_obj.inner_obj());
806-
}
802+
first_byte = *(uint8_t*)u_.r_obj.inner_obj();
807803
break;
808804
case JSON_TAG:
809805
DCHECK_EQ(mask_bits_.encoding, NONE_ENC);
@@ -821,7 +817,7 @@ size_t CompactObj::Size() const {
821817
LOG(DFATAL) << "Should not reach " << int(taglen_);
822818
}
823819
}
824-
return mask_bits_.encoding ? DecodedLen(raw_size, first_byte) : raw_size;
820+
return GetStrEncoding().DecodedSize(raw_size, first_byte);
825821
}
826822

827823
uint64_t CompactObj::HashCode() const {
@@ -848,16 +844,8 @@ uint64_t CompactObj::HashCode() const {
848844

849845
if (IsInline()) {
850846
char buf[kInlineLen * 3]; // should suffice for most huffman decodings.
851-
size_t decoded_len = DecodedLen(taglen_, u_.inline_str[0]);
852-
if (mask_bits_.encoding == HUFFMAN_ENC) {
853-
if (decoded_len <= sizeof(buf) &&
854-
tl.huff_keys.decoder.Decode({u_.inline_str + 1, size_t(taglen_ - 1)}, decoded_len, buf)) {
855-
return XXH3_64bits_withSeed(buf, decoded_len, kHashSeed);
856-
}
857-
} else {
858-
detail::ascii_unpack(to_byte(u_.inline_str), decoded_len, buf);
859-
return XXH3_64bits_withSeed(buf, decoded_len, kHashSeed);
860-
}
847+
size_t decoded_len = GetStrEncoding().Decode(string_view{u_.inline_str, taglen_}, buf);
848+
return XXH3_64bits_withSeed(buf, decoded_len, kHashSeed);
861849
}
862850

863851
string_view sv = GetSlice(&tl.tmp_str);
@@ -1114,21 +1102,7 @@ void CompactObj::GetString(char* dest) const {
11141102
CHECK(!IsExternal());
11151103

11161104
if (IsInline()) {
1117-
switch (mask_bits_.encoding) {
1118-
case ASCII2_ENC:
1119-
DCHECK_EQ(taglen_ + 2u, ascii_len(taglen_));
1120-
detail::ascii_unpack(to_byte(u_.inline_str), taglen_ + 2, dest);
1121-
break;
1122-
case HUFFMAN_ENC:
1123-
tl.huff_keys.decoder.Decode({u_.inline_str + 1, size_t(taglen_ - 1)},
1124-
u_.inline_str[0] + taglen_ - 1, dest);
1125-
break;
1126-
case NONE_ENC:
1127-
memcpy(dest, u_.inline_str, taglen_);
1128-
break;
1129-
default:
1130-
DLOG(FATAL) << "should not reach " << int(mask_bits_.encoding);
1131-
}
1105+
GetStrEncoding().Decode({u_.inline_str, taglen_}, dest);
11321106
return;
11331107
}
11341108

@@ -1142,19 +1116,15 @@ void CompactObj::GetString(char* dest) const {
11421116
if (taglen_ == ROBJ_TAG) {
11431117
CHECK_EQ(OBJ_STRING, u_.r_obj.type());
11441118
DCHECK_EQ(OBJ_ENCODING_RAW, u_.r_obj.encoding());
1145-
size_t decoded_len = DecodedLen(u_.r_obj.Size(), *(const uint8_t*)u_.r_obj.inner_obj());
1146-
if (mask_bits_.encoding == HUFFMAN_ENC) {
1147-
CHECK(tl.huff_keys.decoder.Decode(
1148-
{(const char*)u_.r_obj.inner_obj() + 1, u_.r_obj.Size() - 1}, decoded_len, dest));
1149-
return;
1150-
}
1151-
detail::ascii_unpack_simd(to_byte(u_.r_obj.inner_obj()), decoded_len, dest);
1119+
string_view blob{(const char*)u_.r_obj.inner_obj(), u_.r_obj.Size()};
1120+
GetStrEncoding().Decode(blob, dest);
1121+
return;
11521122
} else {
11531123
CHECK_EQ(SMALL_TAG, taglen_);
11541124
string_view slices[2];
11551125
unsigned num = u_.small_str.GetV(slices);
11561126
DCHECK_EQ(2u, num);
1157-
size_t decoded_len = DecodedLen(u_.small_str.size(), slices[0][0]);
1127+
size_t decoded_len = GetStrEncoding().DecodedSize(u_.small_str.size(), slices[0][0]);
11581128

11591129
if (mask_bits_.encoding == HUFFMAN_ENC) {
11601130
tl.tmp_buf.resize(slices[0].size() + slices[1].size() - 1);
@@ -1575,15 +1545,6 @@ StringOrView CompactObj::GetRawString() const {
15751545
return {};
15761546
}
15771547

1578-
size_t CompactObj::DecodedLen(size_t sz, uint8_t b) const {
1579-
DCHECK(mask_bits_.encoding);
1580-
if (mask_bits_.encoding == HUFFMAN_ENC) {
1581-
return sz + b - 1;
1582-
}
1583-
unsigned delta = (mask_bits_.encoding == ASCII1_ENC) ? 1 : 0;
1584-
return ascii_len(sz) - delta;
1585-
}
1586-
15871548
MemoryResource* CompactObj::memory_resource() {
15881549
return tl.local_mr;
15891550
}
@@ -1613,4 +1574,52 @@ CompactObjType ObjTypeFromString(std::string_view sv) {
16131574
return kInvalidCompactObjType;
16141575
}
16151576

1577+
// Returns the number of bytes `blob` occupies once it is decoded with this encoding.
//
// An empty blob is reported as decoding to zero bytes. It is handled up front because
// there is no first byte to read, and the size arithmetic in the two-argument overload
// is unsigned, so it must not be fed a zero size for the encodings that subtract.
size_t CompactObj::StrEncoding::DecodedSize(string_view blob) const {
  if (blob.empty())
    return 0;

  return DecodedSize(blob.size(), static_cast<uint8_t>(blob[0]));
}
1580+
1581+
size_t CompactObj::StrEncoding::DecodedSize(size_t blob_size, uint8_t first_byte) const {
1582+
switch (enc_) {
1583+
case NONE_ENC:
1584+
return blob_size;
1585+
case ASCII1_ENC:
1586+
case ASCII2_ENC:
1587+
return ascii_len(blob_size) - (enc_ == ASCII1_ENC);
1588+
case HUFFMAN_ENC:
1589+
return blob_size + int(first_byte) - 1;
1590+
};
1591+
return 0;
1592+
}
1593+
1594+
size_t CompactObj::StrEncoding::Decode(std::string_view blob, char* dest) const {
1595+
size_t decoded_len = DecodedSize(blob);
1596+
switch (enc_) {
1597+
case NONE_ENC:
1598+
memcpy(dest, blob.data(), blob.size());
1599+
break;
1600+
case ASCII1_ENC:
1601+
case ASCII2_ENC:
1602+
detail::ascii_unpack(reinterpret_cast<const uint8_t*>(blob.data()), decoded_len, dest);
1603+
break;
1604+
case HUFFMAN_ENC:
1605+
tl.huff_keys.decoder.Decode(blob.substr(1), decoded_len, dest);
1606+
break;
1607+
};
1608+
return decoded_len;
1609+
}
1610+
1611+
// Decodes `blob` and returns the result as a StringOrView.
//
// For NONE_ENC the blob is handed to StringOrView::FromView as is (presumably a
// non-owning view, so `blob`'s storage would have to outlive the result - confirm).
// For every other encoding a new string of DecodedSize(blob) bytes is filled by the
// buffer-based Decode overload and handed to StringOrView::FromString.
StringOrView CompactObj::StrEncoding::Decode(std::string_view blob) const {
  if (enc_ == NONE_ENC)
    return StringOrView::FromView(blob);

  string decoded;
  decoded.resize(DecodedSize(blob));
  Decode(blob, decoded.data());
  return StringOrView::FromString(std::move(decoded));
}
1624+
16161625
} // namespace dfly

src/core/compact_object.h

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -130,14 +130,30 @@ class CompactObj {
130130
// Therefore, in order to know the original length we introduce 2 states that
131131
// correct the length upon decoding. ASCII1_ENC rounds down the decoded length,
132132
// while ASCII2_ENC rounds it up. See StrEncoding::DecodedSize implementation for more info.
133-
enum Encoding : uint8_t {
133+
enum EncodingEnum : uint8_t {
134134
NONE_ENC = 0,
135135
ASCII1_ENC = 1,
136136
ASCII2_ENC = 2,
137137
HUFFMAN_ENC = 3, // TBD
138138
};
139139

140140
public:
141+
// Utility class for working with different string encodings (ascii, huffman, etc)
142+
struct StrEncoding {
143+
size_t DecodedSize(std::string_view blob) const; // Size of decoded blob
144+
size_t Decode(std::string_view blob, char* dest) const; // Decode into dest, return size
145+
StringOrView Decode(std::string_view blob) const;
146+
147+
private:
148+
friend class CompactObj;
149+
explicit StrEncoding(uint8_t enc) : enc_(static_cast<EncodingEnum>(enc)) {
150+
}
151+
152+
size_t DecodedSize(size_t compr_size, uint8_t first_byte) const;
153+
154+
EncodingEnum enc_;
155+
};
156+
141157
using PrefixArray = std::vector<std::string_view>;
142158
using MemoryResource = detail::RobjWrapper::MemoryResource;
143159

@@ -406,6 +422,10 @@ class CompactObj {
406422
// Precondition: the object is a non-inline string.
407423
StringOrView GetRawString() const;
408424

425+
// Returns a StrEncoding helper built from this object's mask_bits_.encoding value.
StrEncoding GetStrEncoding() const {
  const uint8_t enc = mask_bits_.encoding;
  return StrEncoding(enc);
}
428+
409429
bool HasAllocated() const;
410430

411431
bool TagAllowsEmptyValue() const;
@@ -416,7 +436,6 @@ class CompactObj {
416436

417437
private:
418438
void EncodeString(std::string_view str);
419-
size_t DecodedLen(size_t sz, uint8_t firstb) const;
420439

421440
bool EqualNonInline(std::string_view sv) const;
422441

src/core/compact_object_test.cc

Lines changed: 21 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include <mimalloc.h>
99
#include <xxhash.h>
1010

11+
#include <cstddef>
1112
#include <random>
1213

1314
#include "base/gtest.h"
@@ -16,6 +17,7 @@
1617
#include "core/flat_set.h"
1718
#include "core/huff_coder.h"
1819
#include "core/mi_memory_resource.h"
20+
#include "core/string_or_view.h"
1921
#include "core/string_set.h"
2022

2123
extern "C" {
@@ -593,34 +595,28 @@ TEST_F(CompactObjectTest, DefragSet) {
593595
ASSERT_FALSE(cobj_.DefragIfNeeded(0.8));
594596
}
595597

596-
TEST_F(CompactObjectTest, RawInterface) {
597-
string str(50, 'a'), tmp, owned;
598-
cobj_.SetString(str);
599-
{
600-
auto raw_blob = cobj_.GetRawString();
601-
EXPECT_LT(raw_blob.view().size(), str.size());
598+
TEST_F(CompactObjectTest, StrEncodingAndMaterialize) {
599+
for (bool ascii : {true, false}) {
600+
for (size_t len : {64, 128, 256, 512, 1024}) {
601+
string test_str(len, 'a');
602+
for (size_t i = 0; i < len; i++)
603+
test_str[i] = char('a' + (i % 10));
604+
if (!ascii)
605+
test_str.push_back(char(200)); // non-ascii
602606

603-
raw_blob.MakeOwned();
604-
cobj_.SetExternal(0, 10); // dummy external pointer
605-
cobj_.Materialize(raw_blob.view(), true);
607+
CompactObj obj;
608+
obj.SetString(test_str);
606609

607-
EXPECT_EQ(str, cobj_.GetSlice(&tmp));
608-
}
609-
610-
str.assign(50, char(200)); // non ascii
611-
cobj_.SetString(str);
612-
ASSERT_EQ(str, cobj_.GetSlice(&tmp));
613-
614-
{
615-
auto raw_blob = cobj_.GetRawString();
610+
// Test StrEncoding helper
611+
string raw_str = obj.GetRawString().Take();
612+
CompactObj::StrEncoding enc = obj.GetStrEncoding();
613+
EXPECT_EQ(test_str, enc.Decode(raw_str).Take());
616614

617-
EXPECT_EQ(raw_blob.view(), str);
618-
619-
raw_blob.MakeOwned();
620-
cobj_.SetExternal(0, 10); // dummy external pointer
621-
cobj_.Materialize(raw_blob.view(), true);
622-
623-
EXPECT_EQ(str, cobj_.GetSlice(&tmp));
615+
// Test Materialize
616+
obj.SetExternal(0, 0); // dummy values
617+
obj.Materialize(raw_str, true);
618+
EXPECT_EQ(test_str, obj.ToString());
619+
}
624620
}
625621
}
626622

src/core/string_or_view.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,11 @@ class StringOrView {
6565
val_ = std::string{std::get<std::string_view>(val_)};
6666
}
6767

68+
std::string Take() && {
69+
MakeOwned();
70+
return std::move(std::get<std::string>(val_));
71+
}
72+
6873
bool empty() const {
6974
return visit([](const auto& s) { return s.empty(); }, val_);
7075
}

src/server/tiered_storage.cc

Lines changed: 5 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -60,16 +60,6 @@ void RecordDeleted(const PrimeValue& pv, size_t tiered_len, DbTableStats* stats)
6060
stats->tiered_used_bytes -= tiered_len;
6161
}
6262

63-
string DecodeString(bool is_raw, string_view str, PrimeValue decoder) {
64-
if (is_raw) {
65-
decoder.Materialize(str, true);
66-
string tmp;
67-
decoder.GetString(&tmp);
68-
return tmp;
69-
}
70-
return string{str};
71-
}
72-
7363
tiering::DiskSegment FromCoolItem(const PrimeValue::CoolItem& item) {
7464
return {item.record->page_index * tiering::kPageSize + item.page_offset, item.serialized_size};
7565
}
@@ -346,13 +336,9 @@ void TieredStorage::Read(DbIndex dbid, std::string_view key, const PrimeValue& v
346336
std::function<void(const std::string&)> readf) {
347337
DCHECK(value.IsExternal());
348338
DCHECK(!value.IsCool());
349-
350-
PrimeValue decoder;
351-
decoder.ImportExternal(value);
352-
353-
auto cb = [readf = std::move(readf), decoder = std::move(decoder)](
339+
auto cb = [readf = std::move(readf), enc = value.GetStrEncoding()](
354340
bool is_raw, const string* raw_val) mutable {
355-
readf(DecodeString(is_raw, *raw_val, std::move(decoder)));
341+
readf(is_raw ? enc.Decode(*raw_val).Take() : *raw_val);
356342
return false;
357343
};
358344
op_manager_->Enqueue(KeyRef(dbid, key), value.GetExternalSlice(), std::move(cb));
@@ -365,14 +351,11 @@ util::fb2::Future<T> TieredStorage::Modify(DbIndex dbid, std::string_view key,
365351
DCHECK(value.IsExternal());
366352

367353
util::fb2::Future<T> future;
368-
PrimeValue decoder;
369-
decoder.ImportExternal(value);
370-
371-
auto cb = [future, modf = std::move(modf), decoder = std::move(decoder)](
354+
auto cb = [future, modf = std::move(modf), enc = value.GetStrEncoding()](
372355
bool is_raw, std::string* raw_val) mutable {
373356
if (is_raw) {
374-
decoder.Materialize(*raw_val, true);
375-
decoder.GetString(raw_val);
357+
raw_val->resize(enc.DecodedSize(*raw_val));
358+
enc.Decode(*raw_val, raw_val->data());
376359
}
377360
future.Resolve(modf(raw_val));
378361
return true;

0 commit comments

Comments
 (0)