|
50 | 50 | // apple silicon can run most x86-64 instructions, but not necessarily all |
51 | 51 | #define THRIFT_UTIL_VARINTUTILS_BRANCH_FREE_ENCODER 1 |
52 | 52 | #elif defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE2_BITPERM) && \ |
53 | | - __has_include(<arm_neon_sve_bridge.h>) && !FOLLY_MOBILE |
| 53 | + __has_include(<arm_neon_sve_bridge.h>) |
54 | 54 | #define THRIFT_UTIL_VARINTUTILS_BRANCH_FREE_ENCODER 1 |
55 | 55 | #else |
56 | 56 | #define THRIFT_UTIL_VARINTUTILS_BRANCH_FREE_ENCODER 0 |
57 | 57 | #endif |
58 | 58 |
|
59 | 59 | #if THRIFT_UTIL_VARINTUTILS_BRANCH_FREE_ENCODER && FOLLY_AARCH64 |
60 | | -#include <arm_neon.h> |
61 | 60 | #include <arm_neon_sve_bridge.h> // @manual |
62 | 61 | #include <arm_sve.h> |
63 | 62 | #endif |
@@ -431,98 +430,20 @@ uint8_t writeVarintUnrolled(Cursor& c, T value) { |
431 | 430 |
|
432 | 431 | #if THRIFT_UTIL_VARINTUTILS_BRANCH_FREE_ENCODER |
433 | 432 |
|
434 | | -#if FOLLY_AARCH64 |
435 | | - |
436 | | -template <class Cursor, class T> |
437 | | -uint8_t writeVarintSve(Cursor& c, T valueS) { |
438 | | - auto value = folly::to_unsigned(valueS); |
439 | | - if (FOLLY_LIKELY((value & ~0x7f) == 0)) { |
440 | | - c.template write<uint8_t>(static_cast<uint8_t>(value)); |
441 | | - return 1; |
442 | | - } |
443 | | - |
444 | | - if constexpr (sizeof(T) == 1) { |
445 | | - c.template write<uint16_t>(static_cast<uint16_t>(value | 0x100)); |
446 | | - return 2; |
447 | | - } |
448 | | - |
449 | | - enum { maxSize = (8 * sizeof(T) + 6) / 7 }; |
450 | | - c.ensure(maxSize); |
451 | | - |
452 | | - svuint8_t bdepMask = svset_neonq_u8(svundef_u8(), vdupq_n_u8(0x7f)); |
453 | | - uint64x2_t clzMask = vreinterpretq_u64_u8(vdupq_n_u8(0xff)); |
454 | | - uint64x2_t vec; |
455 | | - vec[0] = value; |
456 | | - |
457 | | - vec = svget_neonq_u64(svbdep_u64( |
458 | | - svset_neonq_u64(svundef_u64(), vec), svreinterpret_u64_u8(bdepMask))); |
459 | | - |
460 | | - svuint64_t clzV; |
461 | | - uint64x2_t clzMaskV; |
462 | | - if constexpr (sizeof(T) == 2) { |
463 | | - clzV = svset_neonq_u64(svundef_u64(), vclzq_u32(vec)); |
464 | | - clzMaskV = svget_neonq_u32(svlsr_u32_x( |
465 | | - svptrue_b32(), |
466 | | - svset_neonq_u32(svundef_u32(), vreinterpretq_u32_u64(clzMask)), |
467 | | - svreinterpret_u32_u64(clzV))); |
468 | | - } else { |
469 | | - clzV = svclz_u64_x(svptrue_b64(), svset_neonq_u64(svundef_u64(), vec)); |
470 | | - clzMaskV = svget_neonq_u64(svlsr_u64_x( |
471 | | - svptrue_b64(), svset_neonq_u64(svundef_u64(), clzMask), clzV)); |
472 | | - } |
473 | | - |
474 | | - svuint64_t sizeSV = svlsr_n_u64_x(svptrue_b64(), clzV, 3); |
475 | | - |
476 | | - if constexpr (sizeof(T) == 2) { |
477 | | - sizeSV = svsubr_n_u64_x(svptrue_b64(), sizeSV, 4); |
478 | | - } else { |
479 | | - sizeSV = svsubr_n_u64_x(svptrue_b64(), sizeSV, 8); |
480 | | - } |
481 | | - |
482 | | - vec = svget_neonq_u8(svorr_n_u8_x( |
483 | | - svptrue_b8(), |
484 | | - svset_neonq_u8(svundef_u8(), vreinterpretq_u8_u64(vec)), |
485 | | - 0x80)); |
486 | | - |
487 | | - vec = vandq_u64(vec, clzMaskV); |
488 | | - |
489 | | - if constexpr (sizeof(T) == 8) { |
490 | | - uint64_t orMask = value < (1ull << 56) ? 0 : 0x80; |
491 | | - uint64x2_t orMaskV = vreinterpretq_u64_u8(vdupq_n_u8(orMask)); |
492 | | - vec = vorrq_u64(vec, orMaskV); |
493 | | - } |
494 | | - |
495 | | - uint8_t* p = c.writableData(); |
496 | | - |
497 | | - if constexpr (sizeof(T) == sizeof(uint16_t)) { |
498 | | - vst1q_lane_u16(p, vreinterpretq_u16_u64(vec), 0); |
499 | | - vst1q_lane_u8(p + 2, vreinterpretq_u8_u64(vec), 2); |
500 | | - } else if constexpr (sizeof(T) == sizeof(uint32_t)) { |
501 | | - vst1q_lane_u32(p, vreinterpretq_u32_u64(vec), 0); |
502 | | - vst1q_lane_u8(p + 4, vreinterpretq_u8_u64(vec), 4); |
503 | | - } else { |
504 | | - vst1_u8(p, vget_low_u64(vreinterpretq_u8_u64(vec))); |
505 | | - p[8] = value >> 56; |
506 | | - p[9] = value >> 63; |
507 | | - } |
508 | | - |
509 | | - uint8_t size = svget_neonq_u64(sizeSV)[0]; |
510 | | - if constexpr (sizeof(T) == 8) { |
511 | | - size = value < (1ull << 56) ? size : (value >> 63) + 9; |
512 | | - } |
513 | | - |
514 | | - c.append(size); |
515 | | - return size; |
516 | | -} |
517 | | - |
518 | | -#else |
519 | | - |
520 | 433 | inline uint64_t compressBits(uint64_t value, uint64_t mask) { |
| 434 | +#if FOLLY_X64 |
521 | 435 | return _pdep_u64(value, mask); |
| 436 | +#elif FOLLY_AARCH64 |
| 437 | + // See https://godbolt.org/z/nhc443acd |
| 438 | + const auto vec = svbdep_u64(svdup_n_u64(value), svdup_n_u64(mask)); |
| 439 | + return vgetq_lane_u64(svget_neonq_u64(vec), 0); |
| 440 | +#else |
| 441 | + static_assert(0, "no pdep-equivalent instruction is available"); |
| 442 | +#endif // __BMI2__, __ARM_FEATURE_SVE2_BITPERM |
522 | 443 | } |
523 | 444 |
|
524 | 445 | template <class Cursor, class T> |
525 | | -uint8_t writeVarintBranchFreeX86(Cursor& c, T valueS) { |
| 446 | +uint8_t writeVarintBranchFree(Cursor& c, T valueS) { |
526 | 447 | auto value = folly::to_unsigned(valueS); |
527 | 448 | if (FOLLY_LIKELY((value & ~0x7f) == 0)) { |
528 | 449 | c.template write<uint8_t>(static_cast<uint8_t>(value)); |
@@ -573,17 +494,6 @@ uint8_t writeVarintBranchFreeX86(Cursor& c, T valueS) { |
573 | 494 | return size; |
574 | 495 | } |
575 | 496 |
|
576 | | -#endif |
577 | | - |
578 | | -template <class Cursor, class T> |
579 | | -uint8_t writeVarintBranchFree(Cursor& c, T valueS) { |
580 | | -#if FOLLY_AARCH64 |
581 | | - return writeVarintSve(c, valueS); |
582 | | -#else |
583 | | - return writeVarintBranchFreeX86(c, valueS); |
584 | | -#endif |
585 | | -} |
586 | | - |
587 | 497 | template <class Cursor, class T> |
588 | 498 | uint8_t writeVarint(Cursor& c, T value) { |
589 | 499 | return writeVarintBranchFree(c, value); |
|
0 commit comments