Skip to content

Commit c18cb82

Browse files
authored
feat: implement ascii_unpack using SIMD instructions (dragonflydb#573)
Signed-off-by: Roman Gershman <[email protected]>
1 parent bcafd7e commit c18cb82

File tree

4 files changed

+117
-25
lines changed

4 files changed

+117
-25
lines changed

src/core/compact_object.cc

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -736,7 +736,7 @@ string_view CompactObj::GetSlice(string* scratch) const {
736736
DCHECK_EQ(OBJ_ENCODING_RAW, u_.r_obj.encoding());
737737
size_t decoded_len = DecodedLen(u_.r_obj.Size());
738738
scratch->resize(decoded_len);
739-
detail::ascii_unpack(to_byte(u_.r_obj.inner_obj()), decoded_len, scratch->data());
739+
detail::ascii_unpack_simd(to_byte(u_.r_obj.inner_obj()), decoded_len, scratch->data());
740740
} else if (taglen_ == SMALL_TAG) {
741741
size_t decoded_len = DecodedLen(u_.small_str.size());
742742
size_t space_left = decoded_len - u_.small_str.size();
@@ -749,8 +749,8 @@ string_view CompactObj::GetSlice(string* scratch) const {
749749
memcpy(next, slices[0].data(), slices[0].size());
750750
next += slices[0].size();
751751
memcpy(next, slices[1].data(), slices[1].size());
752-
detail::ascii_unpack(reinterpret_cast<uint8_t*>(scratch->data() + space_left), decoded_len,
753-
scratch->data());
752+
detail::ascii_unpack_simd(reinterpret_cast<uint8_t*>(scratch->data() + space_left),
753+
decoded_len, scratch->data());
754754
} else {
755755
LOG(FATAL) << "Unsupported tag " << int(taglen_);
756756
}
@@ -839,7 +839,7 @@ void CompactObj::GetString(char* dest) const {
839839
CHECK_EQ(OBJ_STRING, u_.r_obj.type());
840840
DCHECK_EQ(OBJ_ENCODING_RAW, u_.r_obj.encoding());
841841
size_t decoded_len = DecodedLen(u_.r_obj.Size());
842-
detail::ascii_unpack(to_byte(u_.r_obj.inner_obj()), decoded_len, dest);
842+
detail::ascii_unpack_simd(to_byte(u_.r_obj.inner_obj()), decoded_len, dest);
843843
} else if (taglen_ == SMALL_TAG) {
844844
size_t decoded_len = DecodedLen(u_.small_str.size());
845845

@@ -853,7 +853,7 @@ void CompactObj::GetString(char* dest) const {
853853
memcpy(next, slices[0].data(), slices[0].size());
854854
next += slices[0].size();
855855
memcpy(next, slices[1].data(), slices[1].size());
856-
detail::ascii_unpack(reinterpret_cast<uint8_t*>(dest + space_left), decoded_len, dest);
856+
detail::ascii_unpack_simd(reinterpret_cast<uint8_t*>(dest + space_left), decoded_len, dest);
857857
} else {
858858
LOG(FATAL) << "Unsupported tag " << int(taglen_);
859859
}

src/core/compact_object_test.cc

Lines changed: 62 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -192,20 +192,20 @@ TEST_F(CompactObjectTest, AsciiUtil) {
192192

193193
char outbuf[32] = "xxxxxxxxxxxxxx";
194194
detail::ascii_pack_simd(data.data(), 7, buf);
195-
detail::ascii_unpack(buf, 7, outbuf);
195+
detail::ascii_unpack_simd(buf, 7, outbuf);
196196

197197
ASSERT_EQ('x', outbuf[7]) << outbuf;
198198
std::string_view actual{outbuf, 7};
199199
ASSERT_EQ(data.substr(0, 7), actual);
200200

201201
string data3;
202-
for (unsigned i = 0; i < 97; ++i) {
202+
for (unsigned i = 0; i < 13; ++i) {
203203
data3.append("12345678910");
204204
}
205205
string act_str(data3.size(), 'y');
206206
std::vector<uint8_t> binvec(detail::binpacked_len(data3.size()));
207207
detail::ascii_pack_simd(data3.data(), data3.size(), binvec.data());
208-
detail::ascii_unpack(binvec.data(), data3.size(), act_str.data());
208+
detail::ascii_unpack_simd(binvec.data(), data3.size(), act_str.data());
209209

210210
ASSERT_EQ(data3, act_str);
211211
}
@@ -483,6 +483,29 @@ static void ascii_pack_naive(const char* ascii, size_t len, uint8_t* bin) {
483483
}
484484
}
485485

486+
// Reference byte-at-a-time unpacker, kept in the test file as the baseline for BM_UnpackNaive.
//
//   bin       - packed input: 7 bytes for every full group of 8 characters, followed by the
//               remaining (ascii_len % 8) characters stored one per byte.
//   ascii_len - number of characters to produce.
//   ascii     - output buffer receiving ascii_len characters.
static void ascii_unpack_naive(const uint8_t* bin, size_t ascii_len, char* ascii) {
487+
constexpr uint8_t kM = 0x7F;
488+
uint8_t p = 0;
489+
unsigned i = 0;
490+
491+
while (ascii_len >= 8) {
492+
for (i = 0; i < 7; ++i) {
493+
uint8_t src = *bin; // keep on stack in case we unpack inplace.
494+
// Character i = top i bits of the previous packed byte (low part) combined with the
// low (7 - i) bits of the current byte shifted up by i. For i == 0 the `p >> 8` term is 0,
// so a stale p left over from the previous group contributes nothing.
*ascii++ = (p >> (8 - i)) | ((src << i) & kM);
495+
p = src;
496+
++bin;
497+
}
498+
499+
ascii_len -= 8;
500+
// The 8th character of the group is the top 7 bits of the 7th packed byte.
*ascii++ = p >> 1;
501+
}
502+
503+
DCHECK_LT(ascii_len, 8u);
504+
// Fewer than 8 characters remain; they are copied verbatim, one byte per character.
for (i = 0; i < ascii_len; ++i) {
505+
*ascii++ = *bin++;
506+
}
507+
}
508+
486509
static void BM_PackNaive(benchmark::State& state) {
487510
string val(1024, 'a');
488511
uint8_t buf[1024];
@@ -523,4 +546,40 @@ static void BM_PackSimd(benchmark::State& state) {
523546
}
524547
BENCHMARK(BM_PackSimd);
525548

549+
// Benchmarks ascii_unpack_naive: a 1024-character string of 'a' is packed once into `buf`
// with detail::ascii_pack, then repeatedly unpacked back into the string's own storage.
static void BM_UnpackNaive(benchmark::State& state) {
550+
string val(1024, 'a');
551+
uint8_t buf[1024];
552+
553+
detail::ascii_pack(val.data(), val.size(), buf);
554+
555+
while (state.KeepRunning()) {
556+
ascii_unpack_naive(buf, val.size(), val.data());
557+
}
558+
}
559+
BENCHMARK(BM_UnpackNaive);
560+
561+
// Benchmarks detail::ascii_unpack: a 1024-character string of 'a' is packed once into `buf`
// with detail::ascii_pack, then repeatedly unpacked back into the string's own storage.
static void BM_Unpack(benchmark::State& state) {
562+
string val(1024, 'a');
563+
uint8_t buf[1024];
564+
565+
detail::ascii_pack(val.data(), val.size(), buf);
566+
567+
while (state.KeepRunning()) {
568+
detail::ascii_unpack(buf, val.size(), val.data());
569+
}
570+
}
571+
BENCHMARK(BM_Unpack);
572+
573+
// Benchmarks detail::ascii_unpack_simd: a 1024-character string of 'a' is packed once into
// `buf` with detail::ascii_pack, then repeatedly unpacked back into the string's own storage.
static void BM_UnpackSimd(benchmark::State& state) {
574+
string val(1024, 'a');
575+
uint8_t buf[1024];
576+
577+
detail::ascii_pack(val.data(), val.size(), buf);
578+
579+
while (state.KeepRunning()) {
580+
detail::ascii_unpack_simd(buf, val.size(), val.data());
581+
}
582+
}
583+
BENCHMARK(BM_UnpackSimd);
584+
526585
} // namespace dfly

src/core/detail/bitpacking.cc

Lines changed: 49 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,6 @@ void ascii_pack2(const char* ascii, size_t len, uint8_t* bin) {
9898

9999
// The algo - do in parallel what ascii_pack does on two uint64_t integers
100100
void ascii_pack_simd(const char* ascii, size_t len, uint8_t* bin) {
101-
__m128i val;
102-
103101
// I leave out 16 bytes in addition to 16 that we load in the loop
104102
// because we store into bin full 16 bytes instead of 14. To prevent data
105103
// overwrite we finish loop one iteration earlier.
@@ -108,7 +106,7 @@ void ascii_pack_simd(const char* ascii, size_t len, uint8_t* bin) {
108106
// Skips the 8th byte (index 7) in the lower 8-byte part.
109107
const __m128i control = _mm_set_epi8(-1, -1, 14, 13, 12, 11, 10, 9, 8, 6, 5, 4, 3, 2, 1, 0);
110108

111-
__m128i rpart, lpart;
109+
__m128i val, rpart, lpart;
112110

113111
// Based on the question I asked here: https://stackoverflow.com/q/74831843/2280111
114112
while (ascii <= end) {
@@ -149,28 +147,62 @@ void ascii_pack_simd(const char* ascii, size_t len, uint8_t* bin) {
149147
// however, if binary data is positioned on the right of the ascii buffer with empty space on the
150148
// left, then we can unpack in place.
151149
// Unpacks `ascii_len` 7-bit characters from `bin` into `ascii`.
//
//   bin       - packed input: 7 bytes for every full group of 8 characters, followed by the
//               remaining (ascii_len % 8) characters stored one per byte.
//   ascii_len - number of characters to produce.
//   ascii     - output buffer with room for ascii_len characters. It may overlap `bin` when the
//               packed data sits at the right end of the same buffer (in-place unpacking).
//
// The 64-bit load/store through memcpy places character 0 in the lowest-addressed byte, which
// assumes a little-endian host.
void ascii_unpack(const uint8_t* bin, size_t ascii_len, char* ascii) {
  // The loop is driven by the remaining length rather than by an `ascii + ascii_len - 8` end
  // pointer: for ascii_len < 8 that expression would point before the start of the buffer.
  while (ascii_len >= 8) {
    uint64_t val = 0;
    if (ascii_len > 8) {
      // At least 9 characters are left, hence at least 8 packed bytes: a full-width load is safe.
      // The extra 8th byte belongs to the next group and is discarded by the first mask below.
      memcpy(&val, bin, 8);
    } else {
      // Last full group with nothing after it: only 7 packed bytes exist, so an 8-byte load
      // would read past the end of the input.
      memcpy(&val, bin, 7);
    }

    // Spread the 56 packed bits so that every 7-bit character lands in its own byte:
    // split into two 28-bit halves, then 14-bit quarters, then single 7-bit characters.
    val = ((val & 0x00FFFFFFF0000000) << 4) | (val & 0x000000000FFFFFFF);
    val = ((val & 0xFFFFC000FFFFC000) << 2) | (val & 0x00003FFF00003FFF);
    val = ((val & 0x7F807F807F807F80) << 1) | (val & 0x007F007F007F007F);
    memcpy(ascii, &val, 8);

    ascii += 8;
    bin += 7;
    ascii_len -= 8;
  }

  // Fewer than 8 characters remain; they are stored verbatim, one byte per character.
  for (; ascii_len > 0; --ascii_len) {
    *ascii++ = *bin++;
  }
}
173170

171+
void ascii_unpack_simd(const uint8_t* bin, size_t ascii_len, char* ascii) {
172+
__m128i val, rpart, lpart;
173+
174+
size_t round_down_len = (ascii_len & ~size_t(0x0F));
175+
const char* end = ascii + round_down_len;
176+
177+
// shifts the second 7-byte blob to the left.
178+
const __m128i control = _mm_set_epi8(14, 13, 12, 11, 10, 9, 8, 7, -1, 6, 5, 4, 3, 2, 1, 0);
179+
180+
while (ascii < end) {
181+
val = _mm_loadu_si128(reinterpret_cast<const __m128i*>(bin));
182+
val = _mm_shuffle_epi8(val, control);
183+
184+
rpart = _mm_and_si128(val, _mm_set1_epi64x(0x000000000FFFFFFF));
185+
lpart = _mm_and_si128(val, _mm_set1_epi64x(0x00FFFFFFF0000000));
186+
val = _mm_or_si128(_mm_slli_epi64(lpart, 4), rpart);
187+
188+
rpart = _mm_and_si128(val, _mm_set1_epi64x(0x00003FFF00003FFF));
189+
lpart = _mm_and_si128(val, _mm_set1_epi64x(0xFFFFC000FFFFC000));
190+
val = _mm_or_si128(_mm_slli_epi64(lpart, 2), rpart);
191+
192+
rpart = _mm_and_si128(val, _mm_set1_epi64x(0x007F007F007F007F));
193+
lpart = _mm_and_si128(val, _mm_set1_epi64x(0x7F807F807F807F80));
194+
val = _mm_or_si128(_mm_slli_epi64(lpart, 1), rpart);
195+
196+
_mm_storeu_si128(reinterpret_cast<__m128i*>(ascii), val);
197+
ascii += 16;
198+
bin += 14;
199+
}
200+
201+
ascii_len -= round_down_len;
202+
if (ascii_len)
203+
ascii_unpack(bin, ascii_len, ascii);
204+
}
205+
174206
// compares packed and unpacked strings. packed must be of length = binpacked_len(ascii_len).
175207
bool compare_packed(const uint8_t* packed, const char* ascii, size_t ascii_len) {
176208
unsigned i = 0;

src/core/detail/bitpacking.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ bool validate_ascii_fast(const char* src, size_t len);
1919
// however, if binary data is positioned on the right of the ascii buffer with empty space on the
2020
// left, then we can unpack in place.
2121
void ascii_unpack(const uint8_t* bin, size_t ascii_len, char* ascii);
22+
void ascii_unpack_simd(const uint8_t* bin, size_t ascii_len, char* ascii);
2223

2324
// packs ascii string (does not verify) into binary form saving 1 bit per byte on average (12.5%).
2425
void ascii_pack(const char* ascii, size_t len, uint8_t* bin);

0 commit comments

Comments
 (0)