|
33 | 33 | #include <nmmintrin.h>
|
34 | 34 | #elif defined(RAPIDJSON_SSE2)
|
35 | 35 | #include <emmintrin.h>
|
| 36 | +#elif defined(RAPIDJSON_NEON) |
| 37 | +#include <arm_neon.h> |
36 | 38 | #endif
|
37 | 39 |
|
38 | 40 | #ifdef _MSC_VER
|
@@ -411,7 +413,92 @@ inline const char *SkipWhitespace_SIMD(const char* p, const char* end) {
|
411 | 413 | return SkipWhitespace(p, end);
|
412 | 414 | }
|
413 | 415 |
|
414 |
| -#endif // RAPIDJSON_SSE2 |
| 416 | +#elif defined(RAPIDJSON_NEON) |
| 417 | + |
| 418 | +//! Skip whitespace with ARM Neon instructions, testing 16 8-byte characters at once. |
| 419 | +inline const char *SkipWhitespace_SIMD(const char* p) { |
| 420 | + // Fast return for single non-whitespace |
| 421 | + if (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t') |
| 422 | + ++p; |
| 423 | + else |
| 424 | + return p; |
| 425 | + |
| 426 | + // 16-byte align to the next boundary |
| 427 | + const char* nextAligned = reinterpret_cast<const char*>((reinterpret_cast<size_t>(p) + 15) & static_cast<size_t>(~15)); |
| 428 | + while (p != nextAligned) |
| 429 | + if (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t') |
| 430 | + ++p; |
| 431 | + else |
| 432 | + return p; |
| 433 | + |
| 434 | + const uint8x16_t w0 = vmovq_n_u8(' '); |
| 435 | + const uint8x16_t w1 = vmovq_n_u8('\n'); |
| 436 | + const uint8x16_t w2 = vmovq_n_u8('\r'); |
| 437 | + const uint8x16_t w3 = vmovq_n_u8('\t'); |
| 438 | + |
| 439 | + for (;; p += 16) { |
| 440 | + const uint8x16_t s = vld1q_u8(reinterpret_cast<const uint8_t *>(p)); |
| 441 | + uint8x16_t x = vceqq_u8(s, w0); |
| 442 | + x = vorrq_u8(x, vceqq_u8(s, w1)); |
| 443 | + x = vorrq_u8(x, vceqq_u8(s, w2)); |
| 444 | + x = vorrq_u8(x, vceqq_u8(s, w3)); |
| 445 | + |
| 446 | + x = vmvnq_u8(x); // Negate |
| 447 | + x = vrev64q_u8(x); // Rev in 64 |
| 448 | + uint64_t low = vgetq_lane_u64(reinterpret_cast<uint64x2_t>(x), 0); // extract |
| 449 | + uint64_t high = vgetq_lane_u64(reinterpret_cast<uint64x2_t>(x), 1); // extract |
| 450 | + |
| 451 | + if (low == 0) { |
| 452 | + if (high != 0) { |
| 453 | + int lz =__builtin_clzll(high);; |
| 454 | + return p + 8 + (lz >> 3); |
| 455 | + } |
| 456 | + } else { |
| 457 | + int lz = __builtin_clzll(low);; |
| 458 | + return p + (lz >> 3); |
| 459 | + } |
| 460 | + } |
| 461 | +} |
| 462 | + |
| 463 | +inline const char *SkipWhitespace_SIMD(const char* p, const char* end) { |
| 464 | + // Fast return for single non-whitespace |
| 465 | + if (p != end && (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t')) |
| 466 | + ++p; |
| 467 | + else |
| 468 | + return p; |
| 469 | + |
| 470 | + const uint8x16_t w0 = vmovq_n_u8(' '); |
| 471 | + const uint8x16_t w1 = vmovq_n_u8('\n'); |
| 472 | + const uint8x16_t w2 = vmovq_n_u8('\r'); |
| 473 | + const uint8x16_t w3 = vmovq_n_u8('\t'); |
| 474 | + |
| 475 | + for (; p <= end - 16; p += 16) { |
| 476 | + const uint8x16_t s = vld1q_u8(reinterpret_cast<const uint8_t *>(p)); |
| 477 | + uint8x16_t x = vceqq_u8(s, w0); |
| 478 | + x = vorrq_u8(x, vceqq_u8(s, w1)); |
| 479 | + x = vorrq_u8(x, vceqq_u8(s, w2)); |
| 480 | + x = vorrq_u8(x, vceqq_u8(s, w3)); |
| 481 | + |
| 482 | + x = vmvnq_u8(x); // Negate |
| 483 | + x = vrev64q_u8(x); // Rev in 64 |
| 484 | + uint64_t low = vgetq_lane_u64(reinterpret_cast<uint64x2_t>(x), 0); // extract |
| 485 | + uint64_t high = vgetq_lane_u64(reinterpret_cast<uint64x2_t>(x), 1); // extract |
| 486 | + |
| 487 | + if (low == 0) { |
| 488 | + if (high != 0) { |
| 489 | + int lz = __builtin_clzll(high); |
| 490 | + return p + 8 + (lz >> 3); |
| 491 | + } |
| 492 | + } else { |
| 493 | + int lz = __builtin_clzll(low); |
| 494 | + return p + (lz >> 3); |
| 495 | + } |
| 496 | + } |
| 497 | + |
| 498 | + return SkipWhitespace(p, end); |
| 499 | +} |
| 500 | + |
| 501 | +#endif // RAPIDJSON_NEON |
415 | 502 |
|
416 | 503 | #ifdef RAPIDJSON_SIMD
|
417 | 504 | //! Template function specialization for InsituStringStream
|
@@ -1129,7 +1216,180 @@ class GenericReader {
|
1129 | 1216 |
|
1130 | 1217 | is.src_ = is.dst_ = p;
|
1131 | 1218 | }
|
1132 |
| -#endif |
| 1219 | +#elif defined(RAPIDJSON_NEON) |
| 1220 | + // StringStream -> StackStream<char> |
| 1221 | + static RAPIDJSON_FORCEINLINE void ScanCopyUnescapedString(StringStream& is, StackStream<char>& os) { |
| 1222 | + const char* p = is.src_; |
| 1223 | + |
| 1224 | + // Scan one by one until alignment (unaligned load may cross page boundary and cause crash) |
| 1225 | + const char* nextAligned = reinterpret_cast<const char*>((reinterpret_cast<size_t>(p) + 15) & static_cast<size_t>(~15)); |
| 1226 | + while (p != nextAligned) |
| 1227 | + if (RAPIDJSON_UNLIKELY(*p == '\"') || RAPIDJSON_UNLIKELY(*p == '\\') || RAPIDJSON_UNLIKELY(static_cast<unsigned>(*p) < 0x20)) { |
| 1228 | + is.src_ = p; |
| 1229 | + return; |
| 1230 | + } |
| 1231 | + else |
| 1232 | + os.Put(*p++); |
| 1233 | + |
| 1234 | + // The rest of string using SIMD |
| 1235 | + const uint8x16_t s0 = vmovq_n_u8('"'); |
| 1236 | + const uint8x16_t s1 = vmovq_n_u8('\\'); |
| 1237 | + const uint8x16_t s2 = vmovq_n_u8('\b'); |
| 1238 | + const uint8x16_t s3 = vmovq_n_u8(32); |
| 1239 | + |
| 1240 | + for (;; p += 16) { |
| 1241 | + const uint8x16_t s = vld1q_u8(reinterpret_cast<const uint8_t *>(p)); |
| 1242 | + uint8x16_t x = vceqq_u8(s, s0); |
| 1243 | + x = vorrq_u8(x, vceqq_u8(s, s1)); |
| 1244 | + x = vorrq_u8(x, vceqq_u8(s, s2)); |
| 1245 | + x = vorrq_u8(x, vcltq_u8(s, s3)); |
| 1246 | + |
| 1247 | + x = vrev64q_u8(x); // Rev in 64 |
| 1248 | + uint64_t low = vgetq_lane_u64(reinterpret_cast<uint64x2_t>(x), 0); // extract |
| 1249 | + uint64_t high = vgetq_lane_u64(reinterpret_cast<uint64x2_t>(x), 1); // extract |
| 1250 | + |
| 1251 | + SizeType length = 0; |
| 1252 | + bool escaped = false; |
| 1253 | + if (low == 0) { |
| 1254 | + if (high != 0) { |
| 1255 | + unsigned lz = (unsigned)__builtin_clzll(high);; |
| 1256 | + length = 8 + (lz >> 3); |
| 1257 | + escaped = true; |
| 1258 | + } |
| 1259 | + } else { |
| 1260 | + unsigned lz = (unsigned)__builtin_clzll(low);; |
| 1261 | + length = lz >> 3; |
| 1262 | + escaped = true; |
| 1263 | + } |
| 1264 | + if (RAPIDJSON_UNLIKELY(escaped)) { // some of characters is escaped |
| 1265 | + if (length != 0) { |
| 1266 | + char* q = reinterpret_cast<char*>(os.Push(length)); |
| 1267 | + for (size_t i = 0; i < length; i++) |
| 1268 | + q[i] = p[i]; |
| 1269 | + |
| 1270 | + p += length; |
| 1271 | + } |
| 1272 | + break; |
| 1273 | + } |
| 1274 | + vst1q_u8(reinterpret_cast<uint8_t *>(os.Push(16)), s); |
| 1275 | + } |
| 1276 | + |
| 1277 | + is.src_ = p; |
| 1278 | + } |
| 1279 | + |
| 1280 | + // InsituStringStream -> InsituStringStream |
| 1281 | + static RAPIDJSON_FORCEINLINE void ScanCopyUnescapedString(InsituStringStream& is, InsituStringStream& os) { |
| 1282 | + RAPIDJSON_ASSERT(&is == &os); |
| 1283 | + (void)os; |
| 1284 | + |
| 1285 | + if (is.src_ == is.dst_) { |
| 1286 | + SkipUnescapedString(is); |
| 1287 | + return; |
| 1288 | + } |
| 1289 | + |
| 1290 | + char* p = is.src_; |
| 1291 | + char *q = is.dst_; |
| 1292 | + |
| 1293 | + // Scan one by one until alignment (unaligned load may cross page boundary and cause crash) |
| 1294 | + const char* nextAligned = reinterpret_cast<const char*>((reinterpret_cast<size_t>(p) + 15) & static_cast<size_t>(~15)); |
| 1295 | + while (p != nextAligned) |
| 1296 | + if (RAPIDJSON_UNLIKELY(*p == '\"') || RAPIDJSON_UNLIKELY(*p == '\\') || RAPIDJSON_UNLIKELY(static_cast<unsigned>(*p) < 0x20)) { |
| 1297 | + is.src_ = p; |
| 1298 | + is.dst_ = q; |
| 1299 | + return; |
| 1300 | + } |
| 1301 | + else |
| 1302 | + *q++ = *p++; |
| 1303 | + |
| 1304 | + // The rest of string using SIMD |
| 1305 | + const uint8x16_t s0 = vmovq_n_u8('"'); |
| 1306 | + const uint8x16_t s1 = vmovq_n_u8('\\'); |
| 1307 | + const uint8x16_t s2 = vmovq_n_u8('\b'); |
| 1308 | + const uint8x16_t s3 = vmovq_n_u8(32); |
| 1309 | + |
| 1310 | + for (;; p += 16, q += 16) { |
| 1311 | + const uint8x16_t s = vld1q_u8(reinterpret_cast<uint8_t *>(p)); |
| 1312 | + uint8x16_t x = vceqq_u8(s, s0); |
| 1313 | + x = vorrq_u8(x, vceqq_u8(s, s1)); |
| 1314 | + x = vorrq_u8(x, vceqq_u8(s, s2)); |
| 1315 | + x = vorrq_u8(x, vcltq_u8(s, s3)); |
| 1316 | + |
| 1317 | + x = vrev64q_u8(x); // Rev in 64 |
| 1318 | + uint64_t low = vgetq_lane_u64(reinterpret_cast<uint64x2_t>(x), 0); // extract |
| 1319 | + uint64_t high = vgetq_lane_u64(reinterpret_cast<uint64x2_t>(x), 1); // extract |
| 1320 | + |
| 1321 | + SizeType length = 0; |
| 1322 | + bool escaped = false; |
| 1323 | + if (low == 0) { |
| 1324 | + if (high != 0) { |
| 1325 | + unsigned lz = (unsigned)__builtin_clzll(high); |
| 1326 | + length = 8 + (lz >> 3); |
| 1327 | + escaped = true; |
| 1328 | + } |
| 1329 | + } else { |
| 1330 | + unsigned lz = (unsigned)__builtin_clzll(low); |
| 1331 | + length = lz >> 3; |
| 1332 | + escaped = true; |
| 1333 | + } |
| 1334 | + if (RAPIDJSON_UNLIKELY(escaped)) { // some of characters is escaped |
| 1335 | + for (const char* pend = p + length; p != pend; ) { |
| 1336 | + *q++ = *p++; |
| 1337 | + } |
| 1338 | + break; |
| 1339 | + } |
| 1340 | + vst1q_u8(reinterpret_cast<uint8_t *>(q), s); |
| 1341 | + } |
| 1342 | + |
| 1343 | + is.src_ = p; |
| 1344 | + is.dst_ = q; |
| 1345 | + } |
| 1346 | + |
| 1347 | + // When read/write pointers are the same for insitu stream, just skip unescaped characters |
| 1348 | + static RAPIDJSON_FORCEINLINE void SkipUnescapedString(InsituStringStream& is) { |
| 1349 | + RAPIDJSON_ASSERT(is.src_ == is.dst_); |
| 1350 | + char* p = is.src_; |
| 1351 | + |
| 1352 | + // Scan one by one until alignment (unaligned load may cross page boundary and cause crash) |
| 1353 | + const char* nextAligned = reinterpret_cast<const char*>((reinterpret_cast<size_t>(p) + 15) & static_cast<size_t>(~15)); |
| 1354 | + for (; p != nextAligned; p++) |
| 1355 | + if (RAPIDJSON_UNLIKELY(*p == '\"') || RAPIDJSON_UNLIKELY(*p == '\\') || RAPIDJSON_UNLIKELY(static_cast<unsigned>(*p) < 0x20)) { |
| 1356 | + is.src_ = is.dst_ = p; |
| 1357 | + return; |
| 1358 | + } |
| 1359 | + |
| 1360 | + // The rest of string using SIMD |
| 1361 | + const uint8x16_t s0 = vmovq_n_u8('"'); |
| 1362 | + const uint8x16_t s1 = vmovq_n_u8('\\'); |
| 1363 | + const uint8x16_t s2 = vmovq_n_u8('\b'); |
| 1364 | + const uint8x16_t s3 = vmovq_n_u8(32); |
| 1365 | + |
| 1366 | + for (;; p += 16) { |
| 1367 | + const uint8x16_t s = vld1q_u8(reinterpret_cast<uint8_t *>(p)); |
| 1368 | + uint8x16_t x = vceqq_u8(s, s0); |
| 1369 | + x = vorrq_u8(x, vceqq_u8(s, s1)); |
| 1370 | + x = vorrq_u8(x, vceqq_u8(s, s2)); |
| 1371 | + x = vorrq_u8(x, vcltq_u8(s, s3)); |
| 1372 | + |
| 1373 | + x = vrev64q_u8(x); // Rev in 64 |
| 1374 | + uint64_t low = vgetq_lane_u64(reinterpret_cast<uint64x2_t>(x), 0); // extract |
| 1375 | + uint64_t high = vgetq_lane_u64(reinterpret_cast<uint64x2_t>(x), 1); // extract |
| 1376 | + |
| 1377 | + if (low == 0) { |
| 1378 | + if (high != 0) { |
| 1379 | + int lz = __builtin_clzll(high); |
| 1380 | + p += 8 + (lz >> 3); |
| 1381 | + break; |
| 1382 | + } |
| 1383 | + } else { |
| 1384 | + int lz = __builtin_clzll(low); |
| 1385 | + p += lz >> 3; |
| 1386 | + break; |
| 1387 | + } |
| 1388 | + } |
| 1389 | + |
| 1390 | + is.src_ = is.dst_ = p; |
| 1391 | + } |
| 1392 | +#endif // RAPIDJSON_NEON |
1133 | 1393 |
|
1134 | 1394 | template<typename InputStream, bool backup, bool pushOnTake>
|
1135 | 1395 | class NumberStream;
|
|
0 commit comments