Skip to content

Commit 787f985

Browse files
committed
Improve: Parsing via simdjson
1 parent 89e72b3 commit 787f985

File tree

3 files changed

+131
-1
lines changed

3 files changed

+131
-1
lines changed

.vscode/settings.json

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@
6868
"Lelbach",
6969
"Lemire",
7070
"Lib",
71+
"libopenblas",
7172
"LIBPFM",
7273
"libunifex",
7374
"liburing",
@@ -273,5 +274,7 @@
273274
"variant": "cpp",
274275
"vector": "cpp",
275276
"version": "cpp"
276-
}
277+
},
278+
"C_Cpp.errorSquiggles": "disabled",
279+
"cSpell.enabled": false
277280
}

CMakeLists.txt

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -308,6 +308,14 @@ FetchContent_Declare(
308308
)
309309
FetchContent_MakeAvailable(YaoyuanGuoYYJSON)
310310

311+
# Daniel Lemire's simdjson for SIMD-accelerated JSON parsing
312+
FetchContent_Declare(
313+
DanielLemireSimdJSON
314+
GIT_REPOSITORY https://github.com/simdjson/simdjson.git
315+
GIT_TAG v3.13.0
316+
)
317+
FetchContent_MakeAvailable(DanielLemireSimdJSON)
318+
311319
# Chris Kohlhoff's ASIO standalone, avoiding Boost... integration is a bit tricky:
312320
# https://github.com/cpm-cmake/CPM.cmake/blob/master/examples/asio-standalone/CMakeLists.txt
313321
FetchContent_Declare(
@@ -454,6 +462,7 @@ target_link_libraries(
454462
unifex
455463
stringzilla
456464
yyjson
465+
simdjson
457466
ctre
458467
asio
459468
# There is no `absl` shortcut:

less_slow.cpp

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5220,6 +5220,124 @@ BENCHMARK(json_nlohmann<arena_json, exception_handling_t::noexcept_k>)
52205220
->Name("json_nlohmann<arena_allocator, noexcept>")
52215221
->Threads(physical_cores());
52225222

5223+
/**
5224+
* simdjson is designed for high-performance JSON parsing using SIMD instructions.
5225+
* It provides On-Demand parsing which is particularly efficient for selective data extraction.
5226+
*/
5227+
#include <simdjson.h>
5228+
5229+
/**
 *  @brief Recursively scans an On-Demand JSON value for the XSS marker string.
 *
 *  Objects and arrays are walked depth-first; string values are searched for
 *  the literal `<script>alert('XSS')</script>`. Any other JSON type yields `false`.
 *
 *  The function is `noexcept`, so every simdjson call goes through the
 *  error-code API (`.get(out)`). Comparing `element.type()` directly against a
 *  `json_type` would go through the implicit conversion of `simdjson_result`,
 *  which throws on error when exceptions are enabled - and a throw escaping
 *  a `noexcept` function terminates the process.
 *
 *  @param element On-Demand value to inspect.
 *  @return `true` if this value, or any value nested inside it, is a string
 *          containing the marker. `false` otherwise; a value whose type or
 *          contents cannot be read is treated as "not found".
 */
bool contains_xss_in_simdjson_ondemand(simdjson::ondemand::value element) noexcept {
    using simdjson::ondemand::json_type;

    // Query the type once, via the non-throwing accessor.
    json_type type {};
    if (element.type().get(type) != simdjson::SUCCESS) return false;

    switch (type) {
    case json_type::object: {
        simdjson::ondemand::object obj;
        if (element.get_object().get(obj) != simdjson::SUCCESS) return false;
        for (auto field : obj) {
            simdjson::ondemand::value val;
            // An unreadable member is skipped, the remaining ones are still checked.
            if (field.value().get(val) != simdjson::SUCCESS) continue;
            if (contains_xss_in_simdjson_ondemand(val)) return true;
        }
        return false;
    }
    case json_type::array: {
        simdjson::ondemand::array arr;
        if (element.get_array().get(arr) != simdjson::SUCCESS) return false;
        for (auto item : arr) {
            simdjson::ondemand::value val;
            // An unreadable item is skipped, the remaining ones are still checked.
            if (item.get(val) != simdjson::SUCCESS) continue;
            if (contains_xss_in_simdjson_ondemand(val)) return true;
        }
        return false;
    }
    case json_type::string: {
        std::string_view str;
        if (element.get_string().get(str) != simdjson::SUCCESS) return false;
        return str.find("<script>alert('XSS')</script>") != std::string_view::npos;
    }
    default:
        // Numbers, booleans, and nulls cannot carry the marker.
        return false;
    }
}
5263+
5264+
/**
 *  @brief Recursively scans a parsed DOM element for the XSS marker string.
 *
 *  Objects and arrays are walked depth-first; string values are searched for
 *  the literal `<script>alert('XSS')</script>`. Any other JSON type yields `false`.
 *
 *  The function is `noexcept`, so the typed accessors are unpacked through the
 *  error-code API (`.get(out)`) instead of the implicit `simdjson_result`
 *  conversions, which are allowed to throw.
 *
 *  @param element DOM element to inspect.
 *  @return `true` if this element, or any element nested inside it, is a
 *          string containing the marker; `false` otherwise.
 */
bool contains_xss_in_simdjson_dom(simdjson::dom::element element) noexcept {
    if (element.is_object()) {
        simdjson::dom::object obj;
        if (element.get_object().get(obj) != simdjson::SUCCESS) return false;
        // Only the values are inspected; keys are ignored.
        for (auto field : obj)
            if (contains_xss_in_simdjson_dom(field.value)) return true;
    }
    else if (element.is_array()) {
        simdjson::dom::array arr;
        if (element.get_array().get(arr) != simdjson::SUCCESS) return false;
        for (auto item : arr)
            if (contains_xss_in_simdjson_dom(item)) return true;
    }
    else if (element.is_string()) {
        std::string_view str;
        if (element.get_string().get(str) != simdjson::SUCCESS) return false;
        return str.find("<script>alert('XSS')</script>") != std::string_view::npos;
    }
    return false;
}
5279+
5280+
/**
 *  @brief Benchmarks simdjson's On-Demand API: iterate over a packet and scan it for the XSS marker.
 *
 *  The three entries of `packets_json` are visited round-robin. Packets that
 *  fail to parse are skipped silently, but their size is still counted
 *  towards the reported throughput.
 */
static void json_simdjson_ondemand(bm::State &state) {
    // simdjson reads from padded buffers; build them once, before timing starts.
    simdjson::padded_string padded_packets[3] = {
        simdjson::padded_string(packets_json[0]),
        simdjson::padded_string(packets_json[1]),
        simdjson::padded_string(packets_json[2]),
    };

    // The parser and the document are constructed once and reused by every iteration.
    simdjson::ondemand::parser parser;
    simdjson::ondemand::document document;

    std::size_t total_bytes = 0;
    std::size_t counter = 0;
    for (auto _ : state) {
        std::size_t const slot = counter % 3;
        ++counter;
        total_bytes += packets_json[slot].size();

        if (parser.iterate(padded_packets[slot]).get(document) != simdjson::SUCCESS) continue;

        simdjson::ondemand::value root;
        if (document.get_value().get(root) != simdjson::SUCCESS) continue;

        bm::DoNotOptimize(contains_xss_in_simdjson_ondemand(root));
    }

    state.SetBytesProcessed(total_bytes);
}
5309+
5310+
/**
 *  @brief Benchmarks simdjson's DOM API: fully parse a packet, then scan the tree for the XSS marker.
 *
 *  The three entries of `packets_json` are visited round-robin. Packets that
 *  fail to parse are skipped silently, but their size is still counted
 *  towards the reported throughput.
 */
static void json_simdjson_dom(bm::State &state) {
    // simdjson reads from padded buffers; build them once, before timing starts.
    simdjson::padded_string padded_packets[3] = {
        simdjson::padded_string(packets_json[0]),
        simdjson::padded_string(packets_json[1]),
        simdjson::padded_string(packets_json[2]),
    };

    // The parser and the root element are constructed once and reused by every iteration.
    simdjson::dom::parser parser;
    simdjson::dom::element root;

    std::size_t total_bytes = 0;
    std::size_t counter = 0;
    for (auto _ : state) {
        std::size_t const slot = counter % 3;
        ++counter;
        total_bytes += packets_json[slot].size();

        if (parser.parse(padded_packets[slot]).get(root) != simdjson::SUCCESS) continue;

        bm::DoNotOptimize(contains_xss_in_simdjson_dom(root));
    }

    state.SetBytesProcessed(total_bytes);
}
5335+
5336+
// Single-threaded runs of both simdjson front-ends.
BENCHMARK(json_simdjson_ondemand)->Name("json_simdjson<ondemand>")->MinTime(10);
BENCHMARK(json_simdjson_dom)->Name("json_simdjson<dom>")->MinTime(10);

// The same two benchmarks again, with the thread count taken from `physical_cores()`.
BENCHMARK(json_simdjson_ondemand)->Name("json_simdjson<ondemand>")->MinTime(10)->Threads(physical_cores());
BENCHMARK(json_simdjson_dom)->Name("json_simdjson<dom>")->MinTime(10)->Threads(physical_cores());
5340+
52235341
/**
52245342
* The results for the single-threaded case and the multi-threaded case without
52255343
* Simultaneous Multi-Threading @b (SMT), with 96 threads on 96 Sapphire Rapids

0 commit comments

Comments
 (0)