| 
50 | 50 | // apple silicon can run most x86-64 instructions, but not necessarily all  | 
51 | 51 | #define THRIFT_UTIL_VARINTUTILS_BRANCH_FREE_ENCODER 1  | 
52 | 52 | #elif defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE2_BITPERM) && \  | 
53 |  | -    __has_include(<arm_neon_sve_bridge.h>)  | 
 | 53 | +    __has_include(<arm_neon_sve_bridge.h>) && !FOLLY_MOBILE  | 
54 | 54 | #define THRIFT_UTIL_VARINTUTILS_BRANCH_FREE_ENCODER 1  | 
55 | 55 | #else  | 
56 | 56 | #define THRIFT_UTIL_VARINTUTILS_BRANCH_FREE_ENCODER 0  | 
57 | 57 | #endif  | 
58 | 58 | 
 
  | 
59 | 59 | #if THRIFT_UTIL_VARINTUTILS_BRANCH_FREE_ENCODER && FOLLY_AARCH64  | 
 | 60 | +#include <arm_neon.h>  | 
60 | 61 | #include <arm_neon_sve_bridge.h> // @manual  | 
61 | 62 | #include <arm_sve.h>  | 
62 | 63 | #endif  | 
@@ -430,20 +431,102 @@ uint8_t writeVarintUnrolled(Cursor& c, T value) {  | 
430 | 431 | 
 
  | 
431 | 432 | #if THRIFT_UTIL_VARINTUTILS_BRANCH_FREE_ENCODER  | 
432 | 433 | 
 
  | 
 | 434 | +#if FOLLY_AARCH64  | 
 | 435 | + | 
 | 436 | +template <class Cursor, class T>  | 
 | 437 | +uint8_t writeVarintSve(Cursor& c, T valueS) {  | 
 | 438 | +  auto value = folly::to_unsigned(valueS);  | 
 | 439 | +  if (FOLLY_LIKELY((value & ~0x7f) == 0)) {  | 
 | 440 | +    c.template write<uint8_t>(static_cast<uint8_t>(value));  | 
 | 441 | +    return 1;  | 
 | 442 | +  }  | 
 | 443 | + | 
 | 444 | +  if constexpr (sizeof(T) == 1) {  | 
 | 445 | +    c.template write<uint16_t>(static_cast<uint16_t>(value | 0x100));  | 
 | 446 | +    return 2;  | 
 | 447 | +  }  | 
 | 448 | + | 
 | 449 | +  enum { maxSize = (8 * sizeof(T) + 6) / 7 };  | 
 | 450 | +  c.ensure(maxSize);  | 
 | 451 | + | 
 | 452 | +  svuint8_t bdepMask = svset_neonq_u8(svundef_u8(), vdupq_n_u8(0x7f));  | 
 | 453 | +  uint64x2_t clzMask = vreinterpretq_u64_u8(vdupq_n_u8(0xff));  | 
 | 454 | +  uint64x2_t vec;  | 
 | 455 | +  vec[0] = value;  | 
 | 456 | + | 
 | 457 | +  vec = svget_neonq_u64(svbdep_u64(  | 
 | 458 | +      svset_neonq_u64(svundef_u64(), vec), svreinterpret_u64_u8(bdepMask)));  | 
 | 459 | + | 
 | 460 | +  svuint64_t clzV;  | 
 | 461 | +  uint64x2_t clzMaskV;  | 
 | 462 | +  if constexpr (sizeof(T) == 2) {  | 
 | 463 | +    clzV = svset_neonq_u64(  | 
 | 464 | +        svundef_u64(),  | 
 | 465 | +        vreinterpretq_u64_u32(vclzq_u32(vreinterpretq_u32_u64(vec))));  | 
 | 466 | +    clzMaskV = vreinterpretq_u64_u32(svget_neonq_u32(svlsr_u32_x(  | 
 | 467 | +        svptrue_b32(),  | 
 | 468 | +        svset_neonq_u32(svundef_u32(), vreinterpretq_u32_u64(clzMask)),  | 
 | 469 | +        svreinterpret_u32_u64(clzV))));  | 
 | 470 | +  } else {  | 
 | 471 | +    clzV = svclz_u64_x(svptrue_b64(), svset_neonq_u64(svundef_u64(), vec));  | 
 | 472 | +    clzMaskV = svget_neonq_u64(svlsr_u64_x(  | 
 | 473 | +        svptrue_b64(), svset_neonq_u64(svundef_u64(), clzMask), clzV));  | 
 | 474 | +  }  | 
 | 475 | + | 
 | 476 | +  svuint64_t sizeSV = svlsr_n_u64_x(svptrue_b64(), clzV, 3);  | 
 | 477 | + | 
 | 478 | +  if constexpr (sizeof(T) == 2) {  | 
 | 479 | +    sizeSV = svsubr_n_u64_x(svptrue_b64(), sizeSV, 4);  | 
 | 480 | +  } else {  | 
 | 481 | +    sizeSV = svsubr_n_u64_x(svptrue_b64(), sizeSV, 8);  | 
 | 482 | +  }  | 
 | 483 | + | 
 | 484 | +  vec = vreinterpretq_u64_u8(svget_neonq_u8(svorr_n_u8_x(  | 
 | 485 | +      svptrue_b8(),  | 
 | 486 | +      svset_neonq_u8(svundef_u8(), vreinterpretq_u8_u64(vec)),  | 
 | 487 | +      0x80)));  | 
 | 488 | + | 
 | 489 | +  vec = vandq_u64(vec, clzMaskV);  | 
 | 490 | + | 
 | 491 | +  if constexpr (sizeof(T) == 8) {  | 
 | 492 | +    uint8_t orMask = value < (1ull << 56) ? 0 : 0x80;  | 
 | 493 | +    uint64x2_t orMaskV = vreinterpretq_u64_u8(vdupq_n_u8(orMask));  | 
 | 494 | +    vec = vorrq_u64(vec, orMaskV);  | 
 | 495 | +  }  | 
 | 496 | + | 
 | 497 | +  uint8_t* p = c.writableData();  | 
 | 498 | + | 
 | 499 | +  if constexpr (sizeof(T) == sizeof(uint16_t)) {  | 
 | 500 | +    vst1q_lane_u16(  | 
 | 501 | +        reinterpret_cast<uint16_t*>(p), vreinterpretq_u16_u64(vec), 0);  | 
 | 502 | +    vst1q_lane_u8(p + 2, vreinterpretq_u8_u64(vec), 2);  | 
 | 503 | +  } else if constexpr (sizeof(T) == sizeof(uint32_t)) {  | 
 | 504 | +    vst1q_lane_u32(  | 
 | 505 | +        reinterpret_cast<uint32_t*>(p), vreinterpretq_u32_u64(vec), 0);  | 
 | 506 | +    vst1q_lane_u8(p + 4, vreinterpretq_u8_u64(vec), 4);  | 
 | 507 | +  } else {  | 
 | 508 | +    vst1q_lane_u64(reinterpret_cast<uint64_t*>(p), vec, 0);  | 
 | 509 | +    p[8] = value >> 56;  | 
 | 510 | +    p[9] = value >> 63;  | 
 | 511 | +  }  | 
 | 512 | + | 
 | 513 | +  uint8_t size = vreinterpretq_u8_u64(svget_neonq_u64(sizeSV))[0];  | 
 | 514 | +  if constexpr (sizeof(T) == 8) {  | 
 | 515 | +    size = value < (1ull << 56) ? size : (value >> 63) + 9;  | 
 | 516 | +  }  | 
 | 517 | + | 
 | 518 | +  c.append(size);  | 
 | 519 | +  return size;  | 
 | 520 | +}  | 
 | 521 | + | 
 | 522 | +#else  | 
 | 523 | + | 
433 | 524 | inline uint64_t compressBits(uint64_t value, uint64_t mask) {  | 
434 |  | -#if FOLLY_X64  | 
435 | 525 |   return _pdep_u64(value, mask);  | 
436 |  | -#elif FOLLY_AARCH64  | 
437 |  | -  // See https://godbolt.org/z/nhc443acd  | 
438 |  | -  const auto vec = svbdep_u64(svdup_n_u64(value), svdup_n_u64(mask));  | 
439 |  | -  return vgetq_lane_u64(svget_neonq_u64(vec), 0);  | 
440 |  | -#else  | 
441 |  | -  static_assert(0, "no pdep-equivalent instruction is available");  | 
442 |  | -#endif // __BMI2__, __ARM_FEATURE_SVE2_BITPERM  | 
443 | 526 | }  | 
444 | 527 | 
 
  | 
445 | 528 | template <class Cursor, class T>  | 
446 |  | -uint8_t writeVarintBranchFree(Cursor& c, T valueS) {  | 
 | 529 | +uint8_t writeVarintBranchFreeX86(Cursor& c, T valueS) {  | 
447 | 530 |   auto value = folly::to_unsigned(valueS);  | 
448 | 531 |   if (FOLLY_LIKELY((value & ~0x7f) == 0)) {  | 
449 | 532 |     c.template write<uint8_t>(static_cast<uint8_t>(value));  | 
@@ -494,6 +577,17 @@ uint8_t writeVarintBranchFree(Cursor& c, T valueS) {  | 
494 | 577 |   return size;  | 
495 | 578 | }  | 
496 | 579 | 
 
  | 
 | 580 | +#endif  | 
 | 581 | + | 
 | 582 | +template <class Cursor, class T>  | 
 | 583 | +uint8_t writeVarintBranchFree(Cursor& c, T valueS) {  | 
 | 584 | +#if FOLLY_AARCH64  | 
 | 585 | +  return writeVarintSve(c, valueS);  | 
 | 586 | +#else  | 
 | 587 | +  return writeVarintBranchFreeX86(c, valueS);  | 
 | 588 | +#endif  | 
 | 589 | +}  | 
 | 590 | + | 
497 | 591 | template <class Cursor, class T>  | 
498 | 592 | uint8_t writeVarint(Cursor& c, T value) {  | 
499 | 593 |   return writeVarintBranchFree(c, value);  | 
 | 
0 commit comments