Skip to content

Commit 6d7a22d

Browse files
committed
decoder: Also deduplicate keys across multiple decode calls, limit deduplicated string length to 64 chars
1 parent 95e8d15 commit 6d7a22d

File tree

5 files changed

+120
-59
lines changed

5 files changed

+120
-59
lines changed

php_simdjson.h

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,11 +62,16 @@ extern zend_module_entry simdjson_module_entry;
6262
#define SIMDJSON_SUPPORT_URL "https://github.com/JakubOnderka/simdjson_php"
6363

6464
#define SIMDJSON_PARSE_DEFAULT_DEPTH 512
65-
66-
/*
65+
/**
6766
* Maximum number of distinct array or object key strings that will be deduplicated
6867
*/
69-
#define SIMDJSON_REPEATED_STRINGS_COUNT 128
68+
#define SIMDJSON_DEDUP_STRING_COUNT 256
69+
/**
70+
* Maximum length of strings to be considered for deduplication.
71+
* Longer strings are less likely to be duplicated and the memory overhead
72+
* of storing them in the hash table might exceed the benefits.
73+
*/
74+
#define SIMDJSON_MAX_DEDUP_LENGTH 64
7075

7176
/*
7277
* NOTE: Namespaces and references(&) are C++ only functionality.

src/simdjson_decoder.cpp

Lines changed: 67 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -211,19 +211,34 @@ static zend_always_inline Bucket *simdjson_zend_hash_str_find_bucket(const HashT
211211
return NULL;
212212
}
213213

214-
static zend_always_inline void simdjson_init_reused_key_strings(HashTable *repeated_key_strings) {
215-
if (UNEXPECTED(repeated_key_strings->nTableSize == 0)) { // array is not initialized yet
216-
zend_hash_init(repeated_key_strings, SIMDJSON_REPEATED_STRINGS_COUNT, NULL, NULL, 0);
217-
zend_hash_real_init_mixed(repeated_key_strings);
218-
}
214+
static zend_always_inline void simdjson_release_reused_key_strings(HashTable *dedup_key_strings) {
215+
ZEND_ASSERT(dedup_key_strings->nNumUsed > 0);
216+
Bucket *p = dedup_key_strings->arData;
217+
Bucket *end = p + dedup_key_strings->nNumUsed;
218+
do {
219+
if (GC_DELREF(p->key) == 0) {
220+
ZEND_ASSERT(!(GC_FLAGS(p->key) & IS_STR_PERSISTENT));
221+
efree(p->key);
222+
}
223+
} while (++p != end);
219224
}
220225

221-
static zend_always_inline void simdjson_clean_reused_key_strings(HashTable *repeated_key_strings) {
222-
ZEND_ASSERT(repeated_key_strings->nTableMask == HT_SIZE_TO_MASK(SIMDJSON_REPEATED_STRINGS_COUNT));
223-
if (repeated_key_strings->nNumUsed) {
224-
HT_HASH_RESET(repeated_key_strings);
225-
repeated_key_strings->nNumUsed = 0;
226-
repeated_key_strings->nNumOfElements = 0;
226+
static zend_always_inline void simdjson_init_reused_key_strings(HashTable *dedup_key_strings) {
227+
if (UNEXPECTED(dedup_key_strings->nTableSize == 0)) {
228+
// hash table is not initialized yet
229+
zend_hash_init(dedup_key_strings, SIMDJSON_DEDUP_STRING_COUNT, NULL, NULL, 0);
230+
// zend_hash_real_init_mixed
231+
void * data = emalloc(HT_SIZE_EX(SIMDJSON_DEDUP_STRING_COUNT, HT_SIZE_TO_MASK(SIMDJSON_DEDUP_STRING_COUNT)));
232+
dedup_key_strings->nTableMask = HT_SIZE_TO_MASK(SIMDJSON_DEDUP_STRING_COUNT);
233+
HT_SET_DATA_ADDR(dedup_key_strings, data);
234+
HT_HASH_RESET(dedup_key_strings);
235+
} else if (dedup_key_strings->nNumUsed > SIMDJSON_DEDUP_STRING_COUNT / 2) {
236+
// more than half of the hash table is already in use, so release the stored keys and reset it
237+
simdjson_release_reused_key_strings(dedup_key_strings);
238+
ZEND_ASSERT(dedup_key_strings->nTableMask == HT_SIZE_TO_MASK(SIMDJSON_DEDUP_STRING_COUNT));
239+
HT_HASH_RESET(dedup_key_strings);
240+
dedup_key_strings->nNumUsed = 0;
241+
dedup_key_strings->nNumOfElements = 0;
227242
}
228243
}
229244

@@ -232,21 +247,26 @@ static zend_always_inline void simdjson_clean_reused_key_strings(HashTable *repe
232247
* This method checks whether the key was already used in the same JSON document and returns a reference to it, or allocates a new string if
233248
* it is unique
234249
*/
235-
static zend_always_inline zend_string* simdjson_reuse_key(HashTable *ht, const char *str, size_t len, zend_ulong h) {
250+
static zend_always_inline zend_string* simdjson_dedup_key(HashTable *ht, const char *str, size_t len, zend_ulong h) {
236251
uint32_t nIndex;
237252
uint32_t idx;
238253
Bucket *p;
239254
zend_string *key;
240255

256+
if (len > SIMDJSON_MAX_DEDUP_LENGTH) {
257+
goto init_new_string;
258+
}
259+
241260
// This should make computation faster, as we know array size
242261
ZEND_ASSERT(ht != NULL);
243-
ZEND_ASSERT(ht->nTableMask == HT_SIZE_TO_MASK(SIMDJSON_REPEATED_STRINGS_COUNT));
262+
ZEND_ASSERT(ht->nTableMask == HT_SIZE_TO_MASK(SIMDJSON_DEDUP_STRING_COUNT));
244263

245264
p = simdjson_zend_hash_str_find_bucket(ht, str, len, h);
246265
if (p) { // Key already exists, reuse
247266
GC_ADDREF(p->key); // raise reference counter by one
248267
return p->key;
249-
} else if (UNEXPECTED(ht->nNumUsed >= SIMDJSON_REPEATED_STRINGS_COUNT)) { // hashtable is full
268+
} else if (UNEXPECTED(ht->nNumUsed >= SIMDJSON_DEDUP_STRING_COUNT)) { // hashtable is full
269+
init_new_string:
250270
key = simdjson_string_init(str, len); // always return new string if hashtable is full
251271
ZSTR_H(key) = h; // set hash to zend_string
252272
return key;
@@ -255,6 +275,7 @@ static zend_always_inline zend_string* simdjson_reuse_key(HashTable *ht, const c
255275
ht->nNumOfElements++;
256276
p = ht->arData + idx;
257277
p->key = simdjson_string_init(str, len); // initialize new string for key
278+
GC_ADDREF(p->key); // raise gc counter by one, so it will be 2
258279
p->h = ZSTR_H(p->key) = h;
259280
//ZVAL_NULL(&p->val); // we don't need to set the value to null, as we don't use it and the destructor is set to NULL
260281
nIndex = h | ht->nTableMask;
@@ -271,7 +292,7 @@ static zend_always_inline zend_string* simdjson_reuse_key(HashTable *ht, const c
271292
* - initialized array as zend_hash_real_init_mixed
272293
* - exact size must be known in advance
273294
*/
274-
static zend_always_inline void simdjson_zend_hash_str_add_or_update(HashTable *ht, const char *str, size_t len, zval *pData, HashTable *repeated_key_strings) {
295+
static zend_always_inline void simdjson_zend_hash_str_add_or_update(HashTable *ht, const char *str, size_t len, zval *pData, HashTable *dedup_key_strings) {
275296
uint32_t nIndex;
276297
uint32_t idx;
277298
Bucket *p;
@@ -297,7 +318,7 @@ static zend_always_inline void simdjson_zend_hash_str_add_or_update(HashTable *h
297318
idx = ht->nNumUsed++;
298319
ht->nNumOfElements++;
299320
p = ht->arData + idx;
300-
p->key = simdjson_reuse_key(repeated_key_strings, str, len, h); // initialize new string for key
321+
p->key = simdjson_dedup_key(dedup_key_strings, str, len, h); // initialize new string for key
301322
//p->key = simdjson_string_init(str, len);
302323
p->h = /* ZSTR_H(p->key) =*/ h;
303324
HT_FLAGS(ht) &= ~HASH_FLAG_STATIC_KEYS;
@@ -309,7 +330,7 @@ static zend_always_inline void simdjson_zend_hash_str_add_or_update(HashTable *h
309330
}
310331
#endif // PHP_VERSION_ID >= 80200
311332

312-
static zend_always_inline void simdjson_add_key_to_symtable(HashTable *ht, const char *buf, size_t len, zval *value, HashTable *repeated_key_strings) {
333+
static zend_always_inline void simdjson_add_key_to_symtable(HashTable *ht, const char *buf, size_t len, zval *value, HashTable *dedup_key_strings) {
313334
#if PHP_VERSION_ID >= 80200
314335
zend_ulong idx;
315336
if (UNEXPECTED(ZEND_HANDLE_NUMERIC_STR(buf, len, idx))) {
@@ -319,7 +340,7 @@ static zend_always_inline void simdjson_add_key_to_symtable(HashTable *ht, const
319340
zend_string *key = len == 1 ? ZSTR_CHAR((unsigned char)buf[0]) : ZSTR_EMPTY_ALLOC();
320341
zend_hash_update(ht, key, value);
321342
} else {
322-
simdjson_zend_hash_str_add_or_update(ht, buf, len, value, repeated_key_strings);
343+
simdjson_zend_hash_str_add_or_update(ht, buf, len, value, dedup_key_strings);
323344
}
324345
#else
325346
if (len <= 1) {
@@ -347,7 +368,7 @@ static zend_always_inline void simdjson_set_zval_to_int64(zval *zv, int64_t valu
347368
ZVAL_LONG(zv, value);
348369
}
349370

350-
static void simdjson_create_array(simdjson::dom::element element, zval *return_value, HashTable *repeated_key_strings) {
371+
static void simdjson_create_array(simdjson::dom::element element, zval *return_value, HashTable *dedup_key_strings) {
351372
switch (element.type()) {
352373
//ASCII sort
353374
case simdjson::dom::element_type::STRING :
@@ -380,7 +401,7 @@ static void simdjson_create_array(simdjson::dom::element element, zval *return_v
380401
zend_array *arr = simdjson_init_packed_array(return_value, 0xFFFFFF);
381402
for (simdjson::dom::element child : json_array) {
382403
zval array_element;
383-
simdjson_create_array(child, &array_element, repeated_key_strings);
404+
simdjson_create_array(child, &array_element, dedup_key_strings);
384405
zend_hash_next_index_insert_new(arr, &array_element);
385406
}
386407
break;
@@ -391,14 +412,14 @@ static void simdjson_create_array(simdjson::dom::element element, zval *return_v
391412
/* Optimised variant of adding elements to array with known size available since PHP 8.2 */
392413
ZEND_HASH_FILL_PACKED(arr) {
393414
for (simdjson::dom::element child : json_array) {
394-
simdjson_create_array(child, __fill_val, repeated_key_strings);
415+
simdjson_create_array(child, __fill_val, dedup_key_strings);
395416
ZEND_HASH_FILL_NEXT();
396417
}
397418
} ZEND_HASH_FILL_END();
398419
#else
399420
for (simdjson::dom::element child : json_array) {
400421
zval array_element;
401-
simdjson_create_array(child, &array_element, repeated_key_strings);
422+
simdjson_create_array(child, &array_element, dedup_key_strings);
402423
zend_hash_next_index_insert_new(arr, &array_element);
403424
}
404425
#endif
@@ -412,16 +433,12 @@ static void simdjson_create_array(simdjson::dom::element element, zval *return_v
412433
break;
413434
}
414435

415-
#if PHP_VERSION_ID >= 80200
416-
// Allocate table for reusing already allocated keys
417-
simdjson_init_reused_key_strings(repeated_key_strings);
418-
#endif
419436
HashTable *ht = simdjson_init_mixed_array(return_value, json_object.size());
420437

421438
for (simdjson::dom::key_value_pair field : json_object) {
422439
zval array_element;
423-
simdjson_create_array(field.value, &array_element, repeated_key_strings);
424-
simdjson_add_key_to_symtable(ht, field.key.data(), field.key.size(), &array_element, repeated_key_strings);
440+
simdjson_create_array(field.value, &array_element, dedup_key_strings);
441+
simdjson_add_key_to_symtable(ht, field.key.data(), field.key.size(), &array_element, dedup_key_strings);
425442
}
426443
break;
427444
}
@@ -431,7 +448,7 @@ static void simdjson_create_array(simdjson::dom::element element, zval *return_v
431448

432449
/* }}} */
433450

434-
static simdjson_php_error_code simdjson_create_object(simdjson::dom::element element, zval *return_value, HashTable *repeated_key_strings) /* {{{ */ {
451+
static simdjson_php_error_code simdjson_create_object(simdjson::dom::element element, zval *return_value, HashTable *dedup_key_strings) /* {{{ */ {
435452
switch (element.type()) {
436453
//ASCII sort
437454
case simdjson::dom::element_type::STRING :
@@ -465,7 +482,7 @@ static simdjson_php_error_code simdjson_create_object(simdjson::dom::element ele
465482

466483
for (simdjson::dom::element child : json_array) {
467484
zval value;
468-
simdjson_php_error_code error = simdjson_create_object(child, &value, repeated_key_strings);
485+
simdjson_php_error_code error = simdjson_create_object(child, &value, dedup_key_strings);
469486
if (UNEXPECTED(error)) {
470487
zval_ptr_dtor(return_value);
471488
ZVAL_NULL(return_value);
@@ -478,10 +495,6 @@ static simdjson_php_error_code simdjson_create_object(simdjson::dom::element ele
478495
case simdjson::dom::element_type::OBJECT : {
479496
const auto json_object = element.get_object().value_unsafe();
480497
zend_object *obj = simdjson_init_object(return_value, json_object.size());
481-
#if PHP_VERSION_ID >= 80200
482-
// Allocate table for reusing already allocated keys
483-
simdjson_init_reused_key_strings(repeated_key_strings);
484-
#endif
485498

486499
for (simdjson::dom::key_value_pair field : json_object) {
487500
const char *data = field.key.data();
@@ -493,7 +506,7 @@ static simdjson_php_error_code simdjson_create_object(simdjson::dom::element ele
493506
return SIMDJSON_PHP_ERR_INVALID_PHP_PROPERTY;
494507
}
495508
zval value;
496-
simdjson_php_error_code error = simdjson_create_object(field.value, &value, repeated_key_strings);
509+
simdjson_php_error_code error = simdjson_create_object(field.value, &value, dedup_key_strings);
497510
if (UNEXPECTED(error)) {
498511
zval_ptr_dtor(return_value);
499512
ZVAL_NULL(return_value);
@@ -507,7 +520,7 @@ static simdjson_php_error_code simdjson_create_object(simdjson::dom::element ele
507520
} else {
508521
#if PHP_VERSION_ID >= 80200
509522
zend_ulong h = zend_inline_hash_func(data, size);
510-
key = simdjson_reuse_key(repeated_key_strings, data, size, h);
523+
key = simdjson_dedup_key(dedup_key_strings, data, size, h);
511524
#else
512525
key = simdjson_string_init(data, size);
513526
#endif
@@ -530,27 +543,30 @@ PHP_SIMDJSON_API simdjson_php_parser* php_simdjson_create_parser(void) /* {{{ */
530543
}
531544

532545
PHP_SIMDJSON_API void php_simdjson_free_parser(simdjson_php_parser* parser) /* {{{ */ {
533-
// Destroy repeated_key_strings hash if was allocated
534-
if (parser->repeated_key_strings.nTableSize) {
535-
efree(HT_GET_DATA_ADDR(&parser->repeated_key_strings));
546+
#if PHP_VERSION_ID >= 80200
547+
// Destroy the dedup_key_strings hash table if it was allocated
548+
if (parser->dedup_key_strings.nTableSize) {
549+
if (parser->dedup_key_strings.nNumUsed) {
550+
simdjson_release_reused_key_strings(&parser->dedup_key_strings);
551+
}
552+
efree(HT_GET_DATA_ADDR(&parser->dedup_key_strings));
536553
}
554+
#endif
537555
delete parser;
538556
}
539557

540-
static zend_always_inline simdjson_php_error_code simdjson_convert_element(simdjson::dom::element element, zval *return_value, bool associative, HashTable *repeated_key_strings) {
558+
static simdjson_php_error_code simdjson_convert_element(simdjson::dom::element element, zval *return_value, bool associative, HashTable *dedup_key_strings) {
559+
#if PHP_VERSION_ID >= 80200
560+
// Allocate table for reusing already allocated keys
561+
simdjson_init_reused_key_strings(dedup_key_strings);
562+
#endif
541563
simdjson_php_error_code resp;
542564
if (associative) {
543-
simdjson_create_array(element, return_value, repeated_key_strings);
565+
simdjson_create_array(element, return_value, dedup_key_strings);
544566
resp = simdjson::SUCCESS;
545567
} else {
546-
resp = simdjson_create_object(element, return_value, repeated_key_strings);
547-
}
548-
#if PHP_VERSION_ID >= 80200
549-
// Cleanup table if repeated_key_strings hashtable was initialized
550-
if (repeated_key_strings->nTableSize != 0) {
551-
simdjson_clean_reused_key_strings(repeated_key_strings);
568+
resp = simdjson_create_object(element, return_value, dedup_key_strings);
552569
}
553-
#endif
554570
return resp;
555571
}
556572

@@ -566,14 +582,14 @@ PHP_SIMDJSON_API simdjson_php_error_code php_simdjson_parse(simdjson_php_parser*
566582
simdjson::dom::element doc;
567583

568584
SIMDJSON_PHP_TRY(build_parsed_json_cust(parser, doc, ZSTR_VAL(json), ZSTR_LEN(json), simdjson_realloc_needed(json), depth));
569-
return simdjson_convert_element(doc, return_value, associative, &parser->repeated_key_strings);
585+
return simdjson_convert_element(doc, return_value, associative, &parser->dedup_key_strings);
570586
}
571587

572588
PHP_SIMDJSON_API simdjson_php_error_code php_simdjson_parse_buffer(simdjson_php_parser* parser, const char *json, size_t len, zval *return_value, bool associative, size_t depth) /* {{{ */ {
573589
simdjson::dom::element doc;
574590

575591
SIMDJSON_PHP_TRY(build_parsed_json_cust(parser, doc, json, len, false, depth));
576-
return simdjson_convert_element(doc, return_value, associative, &parser->repeated_key_strings);
592+
return simdjson_convert_element(doc, return_value, associative, &parser->dedup_key_strings);
577593
}
578594

579595
/* }}} */
@@ -583,7 +599,7 @@ PHP_SIMDJSON_API simdjson_php_error_code php_simdjson_key_value(simdjson_php_par
583599
simdjson::dom::element element;
584600
SIMDJSON_PHP_TRY(build_parsed_json_cust(parser, doc, ZSTR_VAL(json), ZSTR_LEN(json), simdjson_realloc_needed(json), depth));
585601
SIMDJSON_PHP_TRY(get_key_with_optional_prefix(doc, key).get(element));
586-
return simdjson_convert_element(element, return_value, associative, &parser->repeated_key_strings);
602+
return simdjson_convert_element(element, return_value, associative, &parser->dedup_key_strings);
587603
}
588604

589605
/* }}} */

src/simdjson_decoder_defs.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
struct simdjson_php_parser {
2323
public:
2424
simdjson::dom::parser parser;
25-
HashTable repeated_key_strings;
25+
HashTable dedup_key_strings;
2626
};
2727

2828
#endif

tests/decode_repeated.phpt

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ if (PHP_VERSION_ID < 80200) echo "skip deduplication is supported since PHP 8.2\
66
?>
77
--FILE--
88
<?php
9-
$json = '[{"ahoj":"svete"},{"ahoj":"moravo"}]';
9+
$json = '[{"ahoj":"svete"},{"ahoj":"moravo"},{"very_long_key_that_will_not_be_deduplicated_by_simdjson_extension":true},{"very_long_key_that_will_not_be_deduplicated_by_simdjson_extension":true}]';
1010

1111
$value = \json_decode($json, true);
1212
debug_zval_dump(array_key_first($value[0]));
@@ -19,8 +19,24 @@ debug_zval_dump(@key($value[0]));
1919

2020
$value = \simdjson_decode($json, false);
2121
debug_zval_dump(@key($value[0]));
22-
--EXPECTF--
22+
23+
$value = \json_decode($json, true);
24+
debug_zval_dump(array_key_first($value[2]));
25+
26+
$value = \simdjson_decode($json, true);
27+
debug_zval_dump(array_key_first($value[2]));
28+
29+
$value = \json_decode($json, false);
30+
debug_zval_dump(@key($value[2]));
31+
32+
$value = \simdjson_decode($json, false);
33+
debug_zval_dump(@key($value[2]));
34+
--EXPECT--
2335
string(4) "ahoj" refcount(2)
24-
string(4) "ahoj" refcount(3)
36+
string(4) "ahoj" refcount(4)
2537
string(4) "ahoj" refcount(2)
26-
string(4) "ahoj" refcount(3)
38+
string(4) "ahoj" refcount(4)
39+
string(65) "very_long_key_that_will_not_be_deduplicated_by_simdjson_extension" refcount(2)
40+
string(65) "very_long_key_that_will_not_be_deduplicated_by_simdjson_extension" refcount(2)
41+
string(65) "very_long_key_that_will_not_be_deduplicated_by_simdjson_extension" refcount(2)
42+
string(65) "very_long_key_that_will_not_be_deduplicated_by_simdjson_extension" refcount(2)

tests/decode_repeated2.phpt

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
--TEST--
2+
simdjson_decode repeated strings between calls
3+
--SKIPIF--
4+
<?php
5+
if (PHP_VERSION_ID < 80200) echo "skip deduplication is supported since PHP 8.2\n";
6+
?>
7+
--FILE--
8+
<?php
9+
$json = '[{"ahoj":"svete"}]';
10+
11+
$value1 = \simdjson_decode($json, true);
12+
debug_zval_dump(array_key_first($value1[0]));
13+
14+
$value2 = \simdjson_decode($json, true);
15+
debug_zval_dump(array_key_first($value2[0]));
16+
17+
simdjson_cleanup(); // removes also allocated strings
18+
19+
debug_zval_dump(array_key_first($value1[0]));
20+
21+
--EXPECT--
22+
string(4) "ahoj" refcount(3)
23+
string(4) "ahoj" refcount(4)
24+
string(4) "ahoj" refcount(3)

0 commit comments

Comments
 (0)