Skip to content

Commit 33bfac3

Browse files
chzhoozuiderkwast
andauthored
Optimize zset memory usage by embedding element in skiplist (#2508)
By default, when the number of elements in a zset exceeds 128, the underlying data structure adopts a skiplist. We can reduce memory usage by embedding elements into the skiplist nodes. Change the `zskiplistNode` memory layout as follows: ``` Before +-------------+ +-----> | element-sds | | +-------------+ | +------------------+-------+------------------+---------+-----+---------+ | element--pointer | score | backward-pointer | level-0 | ... | level-N | +------------------+-------+------------------+---------+-----+---------+ After +-------+------------------+---------+-----+---------+-------------+ + score | backward-pointer | level-0 | ... | level-N | element-sds | +-------+------------------+---------+-----+---------+-------------+ ``` Before the embedded SDS representation, we include one byte representing the size of the SDS header, i.e. the offset into the SDS representation where that actual string starts. The memory saving is therefore one pointer minus one byte = 7 bytes per element, regardless of other factors such as element size or number of elements. ### Benchmark step I generated the test data using the following lua script && cli command. And check memory usage using the `info` command. **lua script** ``` local start_idx = tonumber(ARGV[1]) local end_idx = tonumber(ARGV[2]) local elem_count = tonumber(ARGV[3]) for i = start_idx, end_idx do local key = "zset:" .. string.format("%012d", i) local members = {} for j = 0, elem_count - 1 do table.insert(members, j) table.insert(members, "member:" .. j) end redis.call("ZADD", key, unpack(members)) end return "OK: Created " .. (end_idx - start_idx + 1) .. " zsets" ``` **valkey-cli command** `valkey-cli EVAL "$(catcreate_zsets.lua)" 0 0 100000 ${ZSET_ELEMENT_NUM}` ### Benchmark result |number of elements in a zset | memory usage before optimization | memory usage after optimization | change | |-------|-------|-------|-------| | 129 | 1047MB | 943MB | -9.9% | | 256 | 2010MB| 1803MB| -10.3%| | 512 | 3904MB|3483MB| -10.8%| --------- Signed-off-by: chzhoo <[email protected]> Co-authored-by: Viktor Söderqvist <[email protected]>
1 parent 616fccb commit 33bfac3

File tree

15 files changed

+303
-86
lines changed

15 files changed

+303
-86
lines changed

src/aof.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1922,7 +1922,7 @@ int rewriteSortedSetObject(rio *r, robj *key, robj *o) {
19221922
return 0;
19231923
}
19241924
}
1925-
sds ele = node->ele;
1925+
sds ele = zslGetNodeElement(node);
19261926
if (!rioWriteBulkDouble(r, node->score) || !rioWriteBulkString(r, ele, sdslen(ele))) {
19271927
hashtableResetIterator(&iter);
19281928
return 0;

src/db.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1054,7 +1054,7 @@ void hashtableScanCallback(void *privdata, void *entry) {
10541054
key = (sds)entry;
10551055
} else if (o->type == OBJ_ZSET) {
10561056
zskiplistNode *node = (zskiplistNode *)entry;
1057-
key = node->ele;
1057+
key = zslGetNodeElement(node);
10581058
/* zset data is copied after filtering by key */
10591059
} else if (o->type == OBJ_HASH) {
10601060
key = entryGetField(entry);
@@ -1077,7 +1077,7 @@ void hashtableScanCallback(void *privdata, void *entry) {
10771077
if (o->type == OBJ_ZSET) {
10781078
/* zset data is copied */
10791079
zskiplistNode *node = (zskiplistNode *)entry;
1080-
key = sdsdup(node->ele);
1080+
key = sdsdup(zslGetNodeElement(node));
10811081
if (!data->only_keys) {
10821082
char buf[MAX_LONG_DOUBLE_CHARS];
10831083
int len = ld2string(buf, sizeof(buf), node->score, LD_STR_AUTO);

src/debug.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,8 @@ void xorObjectDigest(serverDb *db, robj *keyobj, unsigned char *digest, robj *o)
215215
const int len = fpconv_dtoa(node->score, buf);
216216
buf[len] = '\0';
217217
memset(eledigest, 0, 20);
218-
mixDigest(eledigest, node->ele, sdslen(node->ele));
218+
sds ele = zslGetNodeElement(node);
219+
mixDigest(eledigest, ele, sdslen(ele));
219220
mixDigest(eledigest, buf, strlen(buf));
220221
xorDigest(digest, eledigest, 20);
221222
}

src/defrag.c

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -266,12 +266,11 @@ static void activeDefragZsetNode(void *privdata, void *entry_ref) {
266266
zskiplistNode **node_ref = (zskiplistNode **)entry_ref;
267267
zskiplistNode *node = *node_ref;
268268

269-
/* defragment node internals */
270-
sds newsds = activeDefragSds(node->ele);
271-
if (newsds) node->ele = newsds;
269+
size_t allocation_size;
270+
zskiplistNode *newnode = activeDefragAllocWithoutFree(node, &allocation_size);
271+
if (newnode == NULL) return;
272272

273273
const double score = node->score;
274-
const sds ele = node->ele;
275274

276275
/* find skiplist pointers that need to be updated if we end up moving the
277276
* skiplist node. */
@@ -283,7 +282,7 @@ static void activeDefragZsetNode(void *privdata, void *entry_ref) {
283282
zskiplistNode *next = x->level[i].forward;
284283
while (next &&
285284
(next->score < score ||
286-
(next->score == score && sdscmp(next->ele, ele) < 0))) {
285+
(next->score == score && next != node))) {
287286
x = next;
288287
next = x->level[i].forward;
289288
}
@@ -292,12 +291,9 @@ static void activeDefragZsetNode(void *privdata, void *entry_ref) {
292291
/* should have arrived at intended node */
293292
serverAssert(x->level[0].forward == node);
294293

295-
/* try to defrag the skiplist record itself */
296-
zskiplistNode *newnode = activeDefragAlloc(node);
297-
if (newnode) {
298-
zslUpdateNode(zsl, node, newnode, update);
299-
*node_ref = newnode; /* update hashtable pointer */
300-
}
294+
zslUpdateNode(zsl, node, newnode, update);
295+
*node_ref = newnode; /* update hashtable pointer */
296+
allocatorDefragFree(node, allocation_size);
301297
}
302298

303299
#define DEFRAG_SDS_DICT_NO_VAL 0

src/geo.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -322,7 +322,8 @@ int geoGetPointsInRange(robj *zobj, double min, double max, GeoShape *shape, geo
322322
if (!zslValueLteMax(ln->score, &range)) break;
323323
if (geoWithinShape(shape, ln->score, xy, &distance) == C_OK) {
324324
/* Append the new element. */
325-
geoArrayAppend(ga, xy, distance, ln->score, sdsdup(ln->ele));
325+
sds ele = zslGetNodeElement(ln);
326+
geoArrayAppend(ga, xy, distance, ln->score, sdsdup(ele));
326327
}
327328
if (ga->used && limit && ga->used >= limit) break;
328329
ln = ln->level[0].forward;
@@ -825,6 +826,7 @@ void georadiusGeneric(client *c, int srcKeyIndex, int flags) {
825826
totelelen += elelen;
826827
znode = zslInsert(zs->zsl, score, gp->member);
827828
serverAssert(hashtableAdd(zs->ht, znode));
829+
sdsfree(gp->member);
828830
gp->member = NULL;
829831
}
830832

src/module.c

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5165,7 +5165,8 @@ ValkeyModuleString *VM_ZsetRangeCurrentElement(ValkeyModuleKey *key, double *sco
51655165
} else if (key->value->encoding == OBJ_ENCODING_SKIPLIST) {
51665166
zskiplistNode *ln = key->u.zset.current;
51675167
if (score) *score = ln->score;
5168-
str = createStringObject(ln->ele, sdslen(ln->ele));
5168+
sds ele = zslGetNodeElement(ln);
5169+
str = createStringObject(ele, sdslen(ele));
51695170
} else {
51705171
serverPanic("Unsupported zset encoding");
51715172
}
@@ -5222,7 +5223,7 @@ int VM_ZsetRangeNext(ValkeyModuleKey *key) {
52225223
key->u.zset.er = 1;
52235224
return 0;
52245225
} else if (key->u.zset.type == VALKEYMODULE_ZSET_RANGE_LEX) {
5225-
if (!zslLexValueLteMax(next->ele, &key->u.zset.lrs)) {
5226+
if (!zslLexValueLteMax(zslGetNodeElement(next), &key->u.zset.lrs)) {
52265227
key->u.zset.er = 1;
52275228
return 0;
52285229
}
@@ -5284,7 +5285,7 @@ int VM_ZsetRangePrev(ValkeyModuleKey *key) {
52845285
key->u.zset.er = 1;
52855286
return 0;
52865287
} else if (key->u.zset.type == VALKEYMODULE_ZSET_RANGE_LEX) {
5287-
if (!zslLexValueGteMin(prev->ele, &key->u.zset.lrs)) {
5288+
if (!zslLexValueGteMin(zslGetNodeElement(prev), &key->u.zset.lrs)) {
52885289
key->u.zset.er = 1;
52895290
return 0;
52905291
}
@@ -11418,7 +11419,7 @@ static void moduleScanKeyHashtableCallback(void *privdata, void *entry) {
1141811419
/* no value */
1141911420
} else if (o->type == OBJ_ZSET) {
1142011421
zskiplistNode *node = (zskiplistNode *)entry;
11421-
key = node->ele;
11422+
key = zslGetNodeElement(node);
1142211423
value = createStringObjectFromLongDouble(node->score, 0);
1142311424
} else if (o->type == OBJ_HASH) {
1142411425
key = entryGetField(entry);

src/object.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -667,8 +667,9 @@ void dismissZsetObject(robj *o, size_t size_hint) {
667667
if (size_hint / zsl->length >= server.page_size) {
668668
zskiplistNode *zn = zsl->tail;
669669
while (zn != NULL) {
670-
dismissSds(zn->ele);
671-
zn = zn->backward;
670+
zskiplistNode *next = zn->backward;
671+
dismissMemory(zn, 0);
672+
zn = next;
672673
}
673674
}
674675

@@ -1190,7 +1191,6 @@ size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid) {
11901191
asize += sizeof(zset) + sizeof(zskiplist) +
11911192
hashtableMemUsage(ht) + zmalloc_size(zsl->header);
11921193
while (znode != NULL && samples < sample_size) {
1193-
elesize += sdsAllocSize(znode->ele);
11941194
elesize += zmalloc_size(znode);
11951195
samples++;
11961196
znode = znode->level[0].forward;

src/rdb.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -959,7 +959,8 @@ ssize_t rdbSaveObject(rio *rdb, robj *o, robj *key, int dbid) {
959959
* O(1) instead of O(log(N)). */
960960
zskiplistNode *zn = zsl->tail;
961961
while (zn != NULL) {
962-
if ((n = rdbSaveRawString(rdb, (unsigned char *)zn->ele, sdslen(zn->ele))) == -1) {
962+
sds ele = zslGetNodeElement(zn);
963+
if ((n = rdbSaveRawString(rdb, (unsigned char *)ele, sdslen(ele))) == -1) {
963964
return -1;
964965
}
965966
nwritten += n;
@@ -2095,6 +2096,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) {
20952096
totelelen += sdslen(sdsele);
20962097

20972098
znode = zslInsert(zs->zsl, score, sdsele);
2099+
sdsfree(sdsele);
20982100
if (!hashtableAdd(zs->ht, znode)) {
20992101
rdbReportCorruptRDB("Duplicate zset fields detected");
21002102
decrRefCount(o);

src/server.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -561,7 +561,7 @@ hashtableType setHashtableType = {
561561

562562
const void *zsetHashtableGetKey(const void *element) {
563563
const zskiplistNode *node = element;
564-
return node->ele;
564+
return zslGetNodeElement(node);
565565
}
566566

567567
/* Sorted sets hash (note: a skiplist is used in addition to the hash table) */

src/server.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1423,7 +1423,6 @@ struct sharedObjectsStruct {
14231423

14241424
/* ZSETs use a specialized version of Skiplists */
14251425
typedef struct zskiplistNode {
1426-
sds ele;
14271426
double score;
14281427
struct zskiplistNode *backward;
14291428
struct zskiplistLevel {
@@ -1434,6 +1433,7 @@ typedef struct zskiplistNode {
14341433
* So we use it in order to hold the height of the node, which is the number of levels. */
14351434
unsigned long span;
14361435
} level[];
1436+
/* After the level[], sds header length (1 byte) and an embedded sds element are stored. */
14371437
} zskiplistNode;
14381438

14391439
typedef struct zskiplist {
@@ -3275,8 +3275,9 @@ typedef struct {
32753275

32763276
zskiplist *zslCreate(void);
32773277
void zslFree(zskiplist *zsl);
3278-
zskiplistNode *zslInsert(zskiplist *zsl, double score, sds ele);
3278+
zskiplistNode *zslInsert(zskiplist *zsl, double score, const_sds ele);
32793279
zskiplistNode *zslNthInRange(zskiplist *zsl, zrangespec *range, long n, long *rank);
3280+
sds zslGetNodeElement(const zskiplistNode *x);
32803281
double zzlGetScore(unsigned char *sptr);
32813282
void zzlNext(unsigned char *zl, unsigned char **eptr, unsigned char **sptr);
32823283
void zzlPrev(unsigned char *zl, unsigned char **eptr, unsigned char **sptr);

0 commit comments

Comments
 (0)