improve x64 inline asm in 128bit REDC

hurchalla · hurchalla · commit 5db6b5cf620f · 2025-10-13T17:46:25.000-07:00
diff --git a/modular_arithmetic/CMakeLists.txt b/modular_arithmetic/CMakeLists.txt
@@ -75,7 +75,7 @@ include(FetchContent)
 FetchContent_Declare(
     hurchalla_util
     GIT_REPOSITORY https://github.com/hurchalla/util.git
-    GIT_TAG        6901743704ac1caf4e99090ce52e52a40147ba82
+    GIT_TAG        9fac434b586717052c648339eb0f0f89d23e0298
 )
 FetchContent_MakeAvailable(hurchalla_util)
 
diff --git a/montgomery_arithmetic/CMakeLists.txt b/montgomery_arithmetic/CMakeLists.txt
@@ -79,7 +79,7 @@ include(FetchContent)
 FetchContent_Declare(
     hurchalla_util
     GIT_REPOSITORY https://github.com/hurchalla/util.git
-    GIT_TAG        6901743704ac1caf4e99090ce52e52a40147ba82
+    GIT_TAG        9fac434b586717052c648339eb0f0f89d23e0298
 )
 FetchContent_MakeAvailable(hurchalla_util)
 
diff --git a/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/montgomery_two_pow/experimental_montgomery_two_pow.h b/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/montgomery_two_pow/experimental_montgomery_two_pow.h
@@ -3572,7 +3572,7 @@ if HURCHALLA_CPP17_CONSTEXPR (CODE_SECTION == 0) {
         if (n <= MASKBIG) {
             size_t loindex = static_cast<size_t>(n);
             HPBC_CLOCKWORK_ASSERT2(loindex < ut_numeric_limits<RU>::digits);
-#if defined(__GNUC__) && !defined(__clang__)
+#if defined(__GNUC__) && (__GNUC__ >= 14) && !defined(__clang__)
 #  pragma GCC diagnostic push
 #  pragma GCC diagnostic ignored "-Wnrvo"
 #endif
diff --git a/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/montgomery_two_pow/testbench.sh b/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/montgomery_two_pow/testbench.sh
@@ -466,8 +466,6 @@ optimization_level=$2
 define_mont_type=-DDEF_MONT_TYPE=$3
 define_uint_type=-DDEF_UINT_TYPE=$4
 
-define_use_asm=$8
-
 
 cpp_standard=c++17
 
@@ -479,7 +477,9 @@ cpp_standard=c++17
 # SET repo_directory TO THE DIRECTORY WHERE YOU CLONED THE HURCHALLA GIT
 # REPOSITORIES.  (or otherwise ensure the compiler /I flags correctly specify
 # the needed hurchalla include directories)
+
 repo_directory=/Users/jeffreyhurchalla/Desktop
+#repo_directory=/home/jeff/repos
 
 
 
diff --git a/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/low_level_api/detail/platform_specific/ImplRedc.h b/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/low_level_api/detail/platform_specific/ImplRedc.h
@@ -222,13 +222,13 @@ struct RedcIncomplete {
   }
 
 
+
 #if (HURCHALLA_COMPILER_HAS_UINT128_T())
-  // It's possible these __uint128_t versions should be better tested than they
-  // have been so far - I've used the existing REDC unit tests, but little more.
-  // The performance on m2 is excellent, so long as throughput is needed rather
-  // than low latency.
-  // The performance on x86 is unknown at the time of this writing - I haven't
-  // yet tried it on x86.
+  // The performance for these __uint128_t versions on m2 is excellent, so long
+  // as throughput is needed rather than low latency.
+  // Performance benefits on x64 are similar to ARM64 (m2) - these are much
+  // faster than the ordinary versions when using LowuopsTag (for throughput),
+  // and slower when using LowlatencyTag.
 
 
   // Calculates the minuend and subtrahend of the REDC, such that the finalized
@@ -342,52 +342,50 @@ struct RedcIncomplete {
     subtrahend = (static_cast<T>(tmp) << HALF_BITS) | u1;
 # endif
 
-/*
 #elif (defined(HURCHALLA_ALLOW_INLINE_ASM_ALL) || \
      defined(HURCHALLA_ALLOW_INLINE_ASM_REDC)) && \
     defined(HURCHALLA_TARGET_ISA_X86_64) && !defined(_MSC_VER)
-*/
-#elif 0
-    TH m = u0;
+
+    TH tmp = u0;
     TH rrax = n0;
-    TH rrdx, tmp2;
-    __asm__ ("imulq %[invn0], %[m] \n\t"   /* mA = u0 * inv_n */
-             "mulq %[m] \n\t"              /* rdx:rax = mnA_10 = rax * mA (rax == n0); high-order bits of the product in rdx */
-             "movq %%rdx, %[tmp2] \n\t"    /* tmp2 = mnA_1 */
-             "movq %[n1], %%rax \n\t"
-             "mulq %[m] \n\t"              /* rdx:rax = mnA_21 = n1 * mA */
-             "xorl %k[m], %k[m] \n\t"      /* m = 0 */
-             "addq %%rax, %[tmp2] \n\t"    /* tmp2 = mnA_1 = mnA_1_part2 + mnA_1 */
-             "adcq %%rdx, %[m] \n\t"       /* m = mnA_2 + carry */
-             "subq %[tmp2], %[u1] \n\t"    /* u1 = v1 = u1 - mnA_1 */
+    TH rrdx;
+    __asm__ ("imulq %[invn0], %[tmp] \n\t" /* tmp = mA = u0 * invn0 */
+             "mulq %[tmp] \n\t"            /* rdx:rax = mnA_10 = rax * mA (rax == n0); high-order bits of the product in rdx */
+             "movq %[tmp], %%rax \n\t"     /* rax = mA */
+             "movq %%rdx, %[tmp] \n\t"     /* tmp = mnA_1 */
+             "mulq %[n1] \n\t"             /* rdx:rax = mnA_21 = n1 * mA */
+             "addq %%rax, %[tmp] \n\t"     /* tmp = mnA_1 += mnA_1_part2 */
+
+             "movq %[n0], %%rax \n\t"      /* rax = n0_original */
+             "movq %%rdx, %[n0] \n\t"      /* n0 = mnA_2 */
+
+             "adcq $0, %[n0] \n\t"         /* mnA_2 += carry */
+             "subq %[tmp], %[u1] \n\t"     /* u1 = v1 = u1 - mnA_1 */
              "imulq %[u1], %[invn0] \n\t"  /* invn0 = mB = v1 * invn0 */
 
-             "movq %[n0], %%rax \n\t"
-             "mulq %[invn0] \n\t"          /* rdx:rax = mnB_21 = n0 * mB */
+             "mulq %[invn0] \n\t"          /* rdx:rax = mnB_21 = n0_original * mB */
              "movq %%rax, %[u1] \n\t"      /* u1 = mnB_1 */
-             "movq %%rdx, %[n0] \n\t"      /* n0 = mnB_2 */
-
-             "movq %[n1], %%rax \n\t"
-             "mulq %[invn0] \n\t"          /* rdx:rax = mnB_32 = n1 * mB */
-             "xorl %k[invn0], %k[invn0] \n\t"  /* invn0 = 0 */
-             "addq %%rax, %[n0] \n\t"      /* n0 = mnB_2 = mnB_2_part2 + mnB_2 */
-             "adcq %%rdx, %[invn0] \n\t"   /* invn0 = mnB_3 = mnB_3 + carry */
-
-             "xorl %%eax, %%eax \n\t"      /* rax = 0 */
-             "addq %[u1], %[tmp2] \n\t"    /* tmp2 = dummy = mnA_1 + mnB_1 */
-             "adcq %[n0], %[m] \n\t"       /* m = sum2 = mnB_2 + mnA_2 + carry */
-             "adcq %%rax, %[invn0], hs \n\t"  /* tmp = sum3 = mnB_3 += carry */
-             : [m]"+&r"(m), [invn0]"+&r"(invn0),
-               "+&a"(rrax), "=&d"(rrdx), [tmp2]"=&r"(tmp2), [n1]"+&r"(n1), [u1]"+&r"(u1),
-               [n0]"+&r"(n0)
-             :
+
+             "movq %[invn0], %%rax \n\t"   /* rax = mB */
+             "movq %%rdx, %[invn0] \n\t"   /* invn0 = mnB_2 */
+
+             "mulq %[n1] \n\t"             /* rdx:rax = mnB_32 = n1 * mB */
+             "addq %%rax, %[invn0] \n\t"   /* invn0 = mnB_2 += mnB_2_part2 */
+             "adcq $0, %%rdx \n\t"         /* rdx = mnB_3 += carry */
+
+             "addq %[u1], %[tmp] \n\t"     /* tmp = dummy = mnA_1 + mnB_1 */
+             "adcq %[invn0], %[n0] \n\t"   /* n0 = sum2 = mnB_2 + mnA_2 + carry */
+             "adcq $0, %%rdx \n\t"         /* rdx = sum3 = mnB_3 += carry */
+             : [invn0]"+&r"(invn0), "+&a"(rrax), "=&d"(rrdx),
+               [tmp]"+&r"(tmp), [u1]"+&r"(u1), [n0]"+&r"(n0)
+             : [n1]"r"(n1)
              : "cc");
     minuend = u_hi;
-    subtrahend = (static_cast<T>(tmp) << HALF_BITS) | u1;
-
+    subtrahend = (static_cast<T>(rrdx) << HALF_BITS) | n0;
 
 #else  // not using inline-asm
 
+
     TH mA = u0 * invn0;
 
     T mnA_10 = static_cast<T>(mA) * n0;      // mnA_10 <= (R-1)*(R-1) == R^2 - 2R + 1
@@ -490,6 +488,7 @@ struct RedcIncomplete {
   }
 
 
+
   // we can implement the above algorithm more straightforwardly and more
   // efficiently here, since we return the final subtraction result while
   // making no distinction between a positive or negative result.

Original file line number	Diff line number	Diff line change
`@@ -75,7 +75,7 @@ include(FetchContent)`
`75`	`75`	`FetchContent_Declare(`
`76`	`76`	`hurchalla_util`
`77`	`77`	`GIT_REPOSITORY https://github.com/hurchalla/util.git`
`78`		`- GIT_TAG 6901743704ac1caf4e99090ce52e52a40147ba82`
	`78`	`+ GIT_TAG 9fac434b586717052c648339eb0f0f89d23e0298`
`79`	`79`	`)`
`80`	`80`	`FetchContent_MakeAvailable(hurchalla_util)`
`81`	`81`
Original file line number	Diff line number	Diff line change
`@@ -79,7 +79,7 @@ include(FetchContent)`
`79`	`79`	`FetchContent_Declare(`
`80`	`80`	`hurchalla_util`
`81`	`81`	`GIT_REPOSITORY https://github.com/hurchalla/util.git`
`82`		`- GIT_TAG 6901743704ac1caf4e99090ce52e52a40147ba82`
	`82`	`+ GIT_TAG 9fac434b586717052c648339eb0f0f89d23e0298`
`83`	`83`	`)`
`84`	`84`	`FetchContent_MakeAvailable(hurchalla_util)`
`85`	`85`