Skip to content

Commit 5db6b5c

Browse files
committed
improve x64 inline asm in 128-bit REDC
1 parent 37126f8 commit 5db6b5c

File tree

5 files changed

+44
-45
lines changed

5 files changed

+44
-45
lines changed

modular_arithmetic/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ include(FetchContent)
7575
FetchContent_Declare(
7676
hurchalla_util
7777
GIT_REPOSITORY https://github.com/hurchalla/util.git
78-
GIT_TAG 6901743704ac1caf4e99090ce52e52a40147ba82
78+
GIT_TAG 9fac434b586717052c648339eb0f0f89d23e0298
7979
)
8080
FetchContent_MakeAvailable(hurchalla_util)
8181

montgomery_arithmetic/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ include(FetchContent)
7979
FetchContent_Declare(
8080
hurchalla_util
8181
GIT_REPOSITORY https://github.com/hurchalla/util.git
82-
GIT_TAG 6901743704ac1caf4e99090ce52e52a40147ba82
82+
GIT_TAG 9fac434b586717052c648339eb0f0f89d23e0298
8383
)
8484
FetchContent_MakeAvailable(hurchalla_util)
8585

montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/montgomery_two_pow/experimental_montgomery_two_pow.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3572,7 +3572,7 @@ if HURCHALLA_CPP17_CONSTEXPR (CODE_SECTION == 0) {
35723572
if (n <= MASKBIG) {
35733573
size_t loindex = static_cast<size_t>(n);
35743574
HPBC_CLOCKWORK_ASSERT2(loindex < ut_numeric_limits<RU>::digits);
3575-
#if defined(__GNUC__) && !defined(__clang__)
3575+
#if defined(__GNUC__) && (__GNUC__ >= 14) && !defined(__clang__)
35763576
# pragma GCC diagnostic push
35773577
# pragma GCC diagnostic ignored "-Wnrvo"
35783578
#endif

montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/montgomery_two_pow/testbench.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -466,8 +466,6 @@ optimization_level=$2
466466
define_mont_type=-DDEF_MONT_TYPE=$3
467467
define_uint_type=-DDEF_UINT_TYPE=$4
468468

469-
define_use_asm=$8
470-
471469

472470
cpp_standard=c++17
473471

@@ -479,7 +477,9 @@ cpp_standard=c++17
479477
# SET repo_directory TO THE DIRECTORY WHERE YOU CLONED THE HURCHALLA GIT
480478
# REPOSITORIES. (or otherwise ensure the compiler /I flags correctly specify
481479
# the needed hurchalla include directories)
480+
482481
repo_directory=/Users/jeffreyhurchalla/Desktop
482+
#repo_directory=/home/jeff/repos
483483

484484

485485

montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/low_level_api/detail/platform_specific/ImplRedc.h

Lines changed: 39 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -222,13 +222,13 @@ struct RedcIncomplete {
222222
}
223223

224224

225+
225226
#if (HURCHALLA_COMPILER_HAS_UINT128_T())
226-
// It's possible these __uint128_t versions should be better tested than they
227-
// have been so far - I've used the existing REDC unit tests, but little more.
228-
// The performance on m2 is excellent, so long as throughput is needed rather
229-
// than low latency.
230-
// The performance on x86 is unknown at the time of this writing - I haven't
231-
// yet tried it on x86.
227+
// The performance for these __uint128_t versions on m2 is excellent, so long
228+
// as throughput is needed rather than low latency.
229+
// Performance characteristics on x64 are similar to ARM64 (m2) - these are
230+
// much faster than the ordinary versions when using LowuopsTag (for
231+
// throughput), and slower when using LowlatencyTag.
232232

233233

234234
// Calculates the minuend and subtrahend of the REDC, such that the finalized
@@ -342,52 +342,50 @@ struct RedcIncomplete {
342342
subtrahend = (static_cast<T>(tmp) << HALF_BITS) | u1;
343343
# endif
344344

345-
/*
346345
#elif (defined(HURCHALLA_ALLOW_INLINE_ASM_ALL) || \
347346
defined(HURCHALLA_ALLOW_INLINE_ASM_REDC)) && \
348347
defined(HURCHALLA_TARGET_ISA_X86_64) && !defined(_MSC_VER)
349-
*/
350-
#elif 0
351-
TH m = u0;
348+
349+
TH tmp = u0;
352350
TH rrax = n0;
353-
TH rrdx, tmp2;
354-
__asm__ ("imulq %[invn0], %[m] \n\t" /* mA = u0 * inv_n */
355-
"mulq %[m] \n\t" /* rdx:rax = mnA_10 = rax * mA (rax == n0); high-order bits of the product in rdx */
356-
"movq %%rdx, %[tmp2] \n\t" /* tmp2 = mnA_1 */
357-
"movq %[n1], %%rax \n\t"
358-
"mulq %[m] \n\t" /* rdx:rax = mnA_21 = n1 * mA */
359-
"xorl %k[m], %k[m] \n\t" /* m = 0 */
360-
"addq %%rax, %[tmp2] \n\t" /* tmp2 = mnA_1 = mnA_1_part2 + mnA_1 */
361-
"adcq %%rdx, %[m] \n\t" /* m = mnA_2 + carry */
362-
"subq %[tmp2], %[u1] \n\t" /* u1 = v1 = u1 - mnA_1 */
351+
TH rrdx;
352+
__asm__ ("imulq %[invn0], %[tmp] \n\t" /* tmp = mA = u0 * inv_n */
353+
"mulq %[tmp] \n\t" /* rdx:rax = mnA_10 = rax * mA (rax == n0); high-order bits of the product in rdx */
354+
"movq %[tmp], %%rax \n\t" /* rax = mA */
355+
"movq %%rdx, %[tmp] \n\t" /* tmp = mnA_1 */
356+
"mulq %[n1] \n\t" /* rdx:rax = mnA_21 = n1 * mA */
357+
"addq %%rax, %[tmp] \n\t" /* tmp = mnA_1 += mnA_1_part2 */
358+
359+
"movq %[n0], %%rax \n\t" /* rax = n0_original */
360+
"movq %%rdx, %[n0] \n\t" /* n0 = mnA_2 */
361+
362+
"adcq $0, %[n0] \n\t" /* mnA_2 += carry */
363+
"subq %[tmp], %[u1] \n\t" /* u1 = v1 = u1 - mnA_1 */
363364
"imulq %[u1], %[invn0] \n\t" /* invn0 = mB = v1 * invn0 */
364365

365-
"movq %[n0], %%rax \n\t"
366-
"mulq %[invn0] \n\t" /* rdx:rax = mnB_21 = n0 * mB */
366+
"mulq %[invn0] \n\t" /* rdx:rax = mnB_21 = n0_original * mB */
367367
"movq %%rax, %[u1] \n\t" /* u1 = mnB_1 */
368-
"movq %%rdx, %[n0] \n\t" /* n0 = mnB_2 */
369-
370-
"movq %[n1], %%rax \n\t"
371-
"mulq %[invn0] \n\t" /* rdx:rax = mnB_32 = n1 * mB */
372-
"xorl %k[invn0], %k[invn0] \n\t" /* invn0 = 0 */
373-
"addq %%rax, %[n0] \n\t" /* n0 = mnB_2 = mnB_2_part2 + mnB_2 */
374-
"adcq %%rdx, %[invn0] \n\t" /* invn0 = mnB_3 = mnB_3 + carry */
375-
376-
"xorl %%eax, %%eax \n\t" /* rax = 0 */
377-
"addq %[u1], %[tmp2] \n\t" /* tmp2 = dummy = mnA_1 + mnB_1 */
378-
"adcq %[n0], %[m] \n\t" /* m = sum2 = mnB_2 + mnA_2 + carry */
379-
"adcq %%rax, %[invn0], hs \n\t" /* tmp = sum3 = mnB_3 += carry */
380-
: [m]"+&r"(m), [invn0]"+&r"(invn0),
381-
"+&a"(rrax), "=&d"(rrdx), [tmp2]"=&r"(tmp2), [n1]"+&r"(n1), [u1]"+&r"(u1),
382-
[n0]"+&r"(n0)
383-
:
368+
369+
"movq %[invn0], %%rax \n\t" /* rax = mB */
370+
"movq %%rdx, %[invn0] \n\t" /* invn0 = mnB_2 */
371+
372+
"mulq %[n1] \n\t" /* rdx:rax = mnB_32 = n1 * mB */
373+
"addq %%rax, %[invn0] \n\t" /* invn0 = mnB_2 += mnB_2_part2 */
374+
"adcq $0, %%rdx \n\t" /* rdx = mnB_3 += carry */
375+
376+
"addq %[u1], %[tmp] \n\t" /* tmp = dummy = mnA_1 + mnB_1 */
377+
"adcq %[invn0], %[n0] \n\t" /* n0 = sum2 = mnB_2 + mnA_2 + carry */
378+
"adcq $0, %%rdx \n\t" /* rdx = sum3 = mnB_3 += carry */
379+
: [invn0]"+&r"(invn0), "+&a"(rrax), "=&d"(rrdx),
380+
[tmp]"+&r"(tmp), [u1]"+&r"(u1), [n0]"+&r"(n0)
381+
: [n1]"r"(n1)
384382
: "cc");
385383
minuend = u_hi;
386-
subtrahend = (static_cast<T>(tmp) << HALF_BITS) | u1;
387-
384+
subtrahend = (static_cast<T>(rrdx) << HALF_BITS) | n0;
388385

389386
#else // not using inline-asm
390387

388+
391389
TH mA = u0 * invn0;
392390

393391
T mnA_10 = static_cast<T>(mA) * n0; // mnA_10 <= (R-1)*(R-1) == R^2 - 2R + 1
@@ -490,6 +488,7 @@ struct RedcIncomplete {
490488
}
491489

492490

491+
493492
// we can implement the above algorithm more straightforwardly and more
494493
// efficiently here, since we return the final subtraction result while
495494
// making no distinction between a positive or negative result.

0 commit comments

Comments
 (0)