Skip to content

Commit bcafd7e

Browse files
authored
feat: introduce simd algorithm for bitpacking (dragonflydb#568)
My benchmark shows a 3.5x improvement when compressing a 1KB string. Signed-off-by: Roman Gershman <[email protected]>
1 parent adc89c7 commit bcafd7e

File tree

8 files changed

+331
-142
lines changed

8 files changed

+331
-142
lines changed

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,6 @@ jobs:
7777
ccache --show-stats
7878
echo Run ctest -V -L DFLY
7979
#GLOG_logtostderr=1 GLOG_vmodule=transaction=1,engine_shard_set=1
80-
GLOG_logtostderr=1 GLOG_vmodule=rdb_load=1,rdb_save=2,snapshot=2 ctest -V -L DFLY
80+
GLOG_logtostderr=1 GLOG_vmodule=rdb_load=1,rdb_save=1,snapshot=1 ctest -V -L DFLY
8181
./dragonfly_test --mem_defrag_threshold=0.05 # trying to catch issue with defrag
8282
# GLOG_logtostderr=1 GLOG_vmodule=transaction=1,engine_shard_set=1 CTEST_OUTPUT_ON_FAILURE=1 ninja server/test

src/core/CMakeLists.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
add_library(dfly_core compact_object.cc dragonfly_core.cc extent_tree.cc
22
external_alloc.cc interpreter.cc json_object.cc mi_memory_resource.cc
3-
segment_allocator.cc small_string.cc tx_queue.cc dense_set.cc string_set.cc)
3+
segment_allocator.cc small_string.cc tx_queue.cc dense_set.cc string_set.cc
4+
detail/bitpacking.cc)
5+
46
cxx_link(dfly_core base absl::flat_hash_map absl::str_format redis_lib TRDP::lua lua_modules
57
Boost::fiber TRDP::jsoncons crypto)
68

src/core/compact_object.cc

Lines changed: 6 additions & 124 deletions
Original file line numberDiff line numberDiff line change
@@ -23,19 +23,15 @@ extern "C" {
2323
#include "base/flags.h"
2424
#include "base/logging.h"
2525
#include "base/pod_array.h"
26+
#include "core/detail/bitpacking.h"
2627
#include "core/string_set.h"
2728

28-
#if defined(__aarch64__)
29-
#include "base/sse2neon.h"
30-
#else
31-
#include <emmintrin.h>
32-
#endif
33-
3429
ABSL_FLAG(bool, use_set2, true, "If true use DenseSet for an optimized set data structure");
3530

3631
namespace dfly {
3732
using namespace std;
3833
using absl::GetFlag;
34+
using detail::binpacked_len;
3935

4036
namespace {
4137

@@ -154,35 +150,6 @@ inline void FreeObjStream(void* ptr) {
154150
freeStream((stream*)ptr);
155151
}
156152

157-
// Daniel Lemire's function validate_ascii_fast() - under Apache/MIT license.
158-
// See https://github.com/lemire/fastvalidate-utf-8/
159-
// The function returns true (1) if all chars passed in src are
160-
// 7-bit values (0x00..0x7F). Otherwise, it returns false (0).
161-
bool validate_ascii_fast(const char* src, size_t len) {
162-
size_t i = 0;
163-
__m128i has_error = _mm_setzero_si128();
164-
if (len >= 16) {
165-
for (; i <= len - 16; i += 16) {
166-
__m128i current_bytes = _mm_loadu_si128((const __m128i*)(src + i));
167-
has_error = _mm_or_si128(has_error, current_bytes);
168-
}
169-
}
170-
int error_mask = _mm_movemask_epi8(has_error);
171-
172-
char tail_has_error = 0;
173-
for (; i < len; i++) {
174-
tail_has_error |= src[i];
175-
}
176-
error_mask |= (tail_has_error & 0x80);
177-
178-
return !error_mask;
179-
}
180-
181-
// maps ascii len to 7-bit packed length. Each 8 bytes are converted to 7 bytes.
182-
inline constexpr size_t binpacked_len(size_t ascii_len) {
183-
return (ascii_len * 7 + 7) / 8; /* rounded up */
184-
}
185-
186153
// converts 7-bit packed length back to ascii length. Note that this conversion
187154
// is not accurate since it maps 7 bytes to 8 bytes (rounds up), while we may have
188155
// 7-byte strings converted to 7 bytes as well.
@@ -428,91 +395,6 @@ void RobjWrapper::MakeInnerRoom(size_t current_cap, size_t desired, pmr::memory_
428395
inner_obj_ = newp;
429396
}
430397

431-
#if defined(__GNUC__) && !defined(__clang__)
432-
#pragma GCC push_options
433-
#pragma GCC optimize("Ofast")
434-
#endif
435-
436-
// len must be at least 16
437-
void ascii_pack(const char* ascii, size_t len, uint8_t* bin) {
438-
const char* end = ascii + len;
439-
440-
unsigned i = 0;
441-
while (ascii + 8 <= end) {
442-
for (i = 0; i < 7; ++i) {
443-
*bin++ = (ascii[0] >> i) | (ascii[1] << (7 - i));
444-
++ascii;
445-
}
446-
++ascii;
447-
}
448-
449-
// epilog - we do not pack since we have less than 8 bytes.
450-
while (ascii < end) {
451-
*bin++ = *ascii++;
452-
}
453-
}
454-
455-
// unpacks 8->7 encoded blob back to ascii.
456-
// generally, we can not unpack inplace because ascii (dest) buffer is 8/7 bigger than
457-
// the source buffer.
458-
// however, if binary data is positioned on the right of the ascii buffer with empty space on the
459-
// left, then we can unpack inplace.
460-
void ascii_unpack(const uint8_t* bin, size_t ascii_len, char* ascii) {
461-
constexpr uint8_t kM = 0x7F;
462-
uint8_t p = 0;
463-
unsigned i = 0;
464-
465-
while (ascii_len >= 8) {
466-
for (i = 0; i < 7; ++i) {
467-
uint8_t src = *bin; // keep on stack in case we unpack inplace.
468-
*ascii++ = (p >> (8 - i)) | ((src << i) & kM);
469-
p = src;
470-
++bin;
471-
}
472-
473-
ascii_len -= 8;
474-
*ascii++ = p >> 1;
475-
}
476-
477-
DCHECK_LT(ascii_len, 8u);
478-
for (i = 0; i < ascii_len; ++i) {
479-
*ascii++ = *bin++;
480-
}
481-
}
482-
483-
// compares packed and unpacked strings. packed must be of length = binpacked_len(ascii_len).
484-
bool compare_packed(const uint8_t* packed, const char* ascii, size_t ascii_len) {
485-
unsigned i = 0;
486-
bool res = true;
487-
const char* end = ascii + ascii_len;
488-
489-
while (ascii + 8 <= end) {
490-
for (i = 0; i < 7; ++i) {
491-
uint8_t conv = (ascii[0] >> i) | (ascii[1] << (7 - i));
492-
res &= (conv == *packed);
493-
++ascii;
494-
++packed;
495-
}
496-
497-
if (!res)
498-
return false;
499-
500-
++ascii;
501-
}
502-
503-
while (ascii < end) {
504-
if (*ascii++ != *packed++) {
505-
return false;
506-
}
507-
}
508-
509-
return true;
510-
}
511-
512-
#if defined(__GNUC__) && !defined(__clang__)
513-
#pragma GCC pop_options
514-
#endif
515-
516398
} // namespace detail
517399

518400
using namespace std;
@@ -777,7 +659,7 @@ void CompactObj::SetString(std::string_view str) {
777659
DCHECK_GT(str.size(), kInlineLen);
778660

779661
string_view encoded = str;
780-
bool is_ascii = kUseAsciiEncoding && validate_ascii_fast(str.data(), str.size());
662+
bool is_ascii = kUseAsciiEncoding && detail::validate_ascii_fast(str.data(), str.size());
781663

782664
if (is_ascii) {
783665
size_t encode_len = binpacked_len(str.size());
@@ -792,7 +674,7 @@ void CompactObj::SetString(std::string_view str) {
792674
}
793675

794676
tl.tmp_buf.resize(encode_len);
795-
detail::ascii_pack(str.data(), str.size(), tl.tmp_buf.data());
677+
detail::ascii_pack_simd(str.data(), str.size(), tl.tmp_buf.data());
796678
encoded = string_view{reinterpret_cast<char*>(tl.tmp_buf.data()), encode_len};
797679

798680
if (encoded.size() <= kInlineLen) {
@@ -1125,7 +1007,7 @@ bool CompactObj::CmpEncoded(string_view sv) const {
11251007
if (u_.r_obj.Size() != encode_len)
11261008
return false;
11271009

1128-
if (!validate_ascii_fast(sv.data(), sv.size()))
1010+
if (!detail::validate_ascii_fast(sv.data(), sv.size()))
11291011
return false;
11301012

11311013
return detail::compare_packed(to_byte(u_.r_obj.inner_obj()), sv.data(), sv.size());
@@ -1139,7 +1021,7 @@ bool CompactObj::CmpEncoded(string_view sv) const {
11391021
if (u_.small_str.size() != encode_len)
11401022
return false;
11411023

1142-
if (!validate_ascii_fast(sv.data(), sv.size()))
1024+
if (!detail::validate_ascii_fast(sv.data(), sv.size()))
11431025
return false;
11441026

11451027
// We need to compare an unpacked sv with 2 packed parts.

src/core/compact_object.h

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -76,16 +76,6 @@ class RobjWrapper {
7676

7777
} __attribute__((packed));
7878

79-
// unpacks 8->7 encoded blob back to ascii.
80-
// generally, we can not unpack inplace because ascii (dest) buffer is 8/7 bigger than
81-
// the source buffer.
82-
// however, if binary data is positioned on the right of the ascii buffer with empty space on the
83-
// left, then we can unpack inplace.
84-
void ascii_unpack(const uint8_t* bin, size_t ascii_len, char* ascii);
85-
86-
// packs ascii string (does not verify) into binary form saving 1 bit per byte on average (12.5%).
87-
void ascii_pack(const char* ascii, size_t len, uint8_t* bin);
88-
8979
} // namespace detail
9080

9181
class CompactObj {

src/core/compact_object_test.cc

Lines changed: 75 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
#include "base/gtest.h"
1414
#include "base/logging.h"
15+
#include "core/detail/bitpacking.h"
1516
#include "core/flat_set.h"
1617
#include "core/json_object.h"
1718
#include "core/mi_memory_resource.h"
@@ -189,13 +190,24 @@ TEST_F(CompactObjectTest, AsciiUtil) {
189190
std::string_view data{"aaaaaabb"};
190191
uint8_t buf[32];
191192

192-
char ascii2[] = "xxxxxxxxxxxxxx";
193-
detail::ascii_pack(data.data(), 7, buf);
194-
detail::ascii_unpack(buf, 7, ascii2);
193+
char outbuf[32] = "xxxxxxxxxxxxxx";
194+
detail::ascii_pack_simd(data.data(), 7, buf);
195+
detail::ascii_unpack(buf, 7, outbuf);
195196

196-
ASSERT_EQ('x', ascii2[7]) << ascii2;
197-
std::string_view actual{ascii2, 7};
197+
ASSERT_EQ('x', outbuf[7]) << outbuf;
198+
std::string_view actual{outbuf, 7};
198199
ASSERT_EQ(data.substr(0, 7), actual);
200+
201+
string data3;
202+
for (unsigned i = 0; i < 97; ++i) {
203+
data3.append("12345678910");
204+
}
205+
string act_str(data3.size(), 'y');
206+
std::vector<uint8_t> binvec(detail::binpacked_len(data3.size()));
207+
detail::ascii_pack_simd(data3.data(), data3.size(), binvec.data());
208+
detail::ascii_unpack(binvec.data(), data3.size(), act_str.data());
209+
210+
ASSERT_EQ(data3, act_str);
199211
}
200212

201213
TEST_F(CompactObjectTest, IntSet) {
@@ -453,4 +465,62 @@ TEST_F(CompactObjectTest, JsonTypeWithPathTest) {
453465
}
454466
}
455467

468+
static void ascii_pack_naive(const char* ascii, size_t len, uint8_t* bin) {
469+
const char* end = ascii + len;
470+
471+
unsigned i = 0;
472+
while (ascii + 8 <= end) {
473+
for (i = 0; i < 7; ++i) {
474+
*bin++ = (ascii[0] >> i) | (ascii[1] << (7 - i));
475+
++ascii;
476+
}
477+
++ascii;
478+
}
479+
480+
// epilog - we do not pack since we have less than 8 bytes.
481+
while (ascii < end) {
482+
*bin++ = *ascii++;
483+
}
484+
}
485+
486+
static void BM_PackNaive(benchmark::State& state) {
487+
string val(1024, 'a');
488+
uint8_t buf[1024];
489+
490+
while (state.KeepRunning()) {
491+
ascii_pack_naive(val.data(), val.size(), buf);
492+
}
493+
}
494+
BENCHMARK(BM_PackNaive);
495+
496+
static void BM_Pack(benchmark::State& state) {
497+
string val(1024, 'a');
498+
uint8_t buf[1024];
499+
500+
while (state.KeepRunning()) {
501+
detail::ascii_pack(val.data(), val.size(), buf);
502+
}
503+
}
504+
BENCHMARK(BM_Pack);
505+
506+
static void BM_Pack2(benchmark::State& state) {
507+
string val(1024, 'a');
508+
uint8_t buf[1024];
509+
510+
while (state.KeepRunning()) {
511+
detail::ascii_pack(val.data(), val.size(), buf);
512+
}
513+
}
514+
BENCHMARK(BM_Pack2);
515+
516+
static void BM_PackSimd(benchmark::State& state) {
517+
string val(1024, 'a');
518+
uint8_t buf[1024];
519+
520+
while (state.KeepRunning()) {
521+
detail::ascii_pack_simd(val.data(), val.size(), buf);
522+
}
523+
}
524+
BENCHMARK(BM_PackSimd);
525+
456526
} // namespace dfly

0 commit comments

Comments
 (0)