Skip to content

Commit fd1060d

Browse files
committed
finish adding x64 inline asm in 128bit REDC
1 parent 5db6b5c commit fd1060d

File tree

1 file changed

+43
-0
lines changed
  • montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/low_level_api/detail/platform_specific

1 file changed

+43
-0
lines changed

montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/low_level_api/detail/platform_specific/ImplRedc.h

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -549,7 +549,50 @@ struct RedcIncomplete {
549549

550550
T t_hi = (static_cast<T>(tmp) << HALF_BITS) | u1;
551551

552+
#elif (defined(HURCHALLA_ALLOW_INLINE_ASM_ALL) || \
553+
defined(HURCHALLA_ALLOW_INLINE_ASM_REDC)) && \
554+
defined(HURCHALLA_TARGET_ISA_X86_64) && !defined(_MSC_VER)
555+
556+
TH u2 = static_cast<TH>(u_hi);
557+
TH u3 = static_cast<TH>(u_hi >> HALF_BITS);
558+
559+
TH tmp = u0;
560+
TH rrax = n0;
561+
TH rrdx;
562+
__asm__ ("imulq %[invn0], %[tmp] \n\t" /* tmp = mA = u0 * inv_n */
563+
"mulq %[tmp] \n\t" /* rdx:rax = mnA_10 = rax * mA (rax == n0); high-order bits of the product in rdx */
564+
"movq %[tmp], %%rax \n\t" /* rax = mA */
565+
"movq %%rdx, %[tmp] \n\t" /* tmp = mnA_1 */
566+
"mulq %[n1] \n\t" /* rdx:rax = mnA_21 = n1 * mA */
567+
"addq %%rax, %[tmp] \n\t" /* tmp = mnA_1 += mnA_1_part2 */
568+
569+
"movq %[n0], %%rax \n\t" /* rax = n0_original */
570+
571+
"adcq $0, %%rdx \n\t" /* mnA_2 += carry */
572+
"subq %[tmp], %[u1] \n\t" /* u1 = v1 = u1 - mnA_1 */
573+
"sbbq %%rdx, %[u2] \n\t" /* u2 = v2 = u2 - mnA_2 - borrow */
574+
"sbbq $0, %[u3] \n\t" /* u3 = v3 = u3 - borrow */
575+
576+
"imulq %[invn0], %[u1] \n\t" /* u1 = mB = v1 * invn0 */
577+
578+
"mulq %[u1] \n\t" /* rdx:rax = mnB_21 = n0_original * mB */
579+
"movq %[u1], %%rax \n\t" /* rax = mB */
580+
"movq %%rdx, %[u1] \n\t" /* u1 = mnB_2 */
581+
"mulq %[n1] \n\t" /* rdx:rax = mnB_32 = n1 * mB */
582+
"addq %%rax, %[u1] \n\t" /* u1 = mnB_2 += mnB_2_part2 */
583+
"adcq $0, %%rdx \n\t" /* rdx = mnB_3 += carry */
584+
585+
"subq %[u1], %[u2] \n\t" /* t2 = v2 - mnB_2 */
586+
"sbbq %%rdx, %[u3] \n\t" /* t3 = v3 - mnB_3 - borrow */
587+
: [invn0]"+&r"(invn0), "+&a"(rrax), "=&d"(rrdx), [tmp]"+&r"(tmp),
588+
[u1]"+&r"(u1), [u2]"+&r"(u2), [u3]"+&r"(u3)
589+
: [n0]"r"(n0), [n1]"r"(n1)
590+
: "cc");
591+
T t_hi = (static_cast<T>(u3) << HALF_BITS) | u2;
592+
552593
#else
594+
// no inline-asm
595+
553596
TH mA = u0 * invn0;
554597

555598
T mnA_10 = static_cast<T>(mA) * n0;

0 commit comments

Comments
 (0)