@@ -262,11 +262,6 @@ struct RedcIncomplete {
262262 defined (HURCHALLA_ALLOW_INLINE_ASM_REDC)) && \
263263 defined (HURCHALLA_TARGET_ISA_ARM_64) && !defined (_MSC_VER)
264264
265- TH u2 = static_cast <TH>(u_hi);
266- TH u3 = static_cast <TH>(u_hi >> HALF_BITS);
267- (void )u2; // avoid warning when a #if section doesn't use u2
268- (void )u3; // avoid warning when a #if section doesn't use u3
269-
270265# if 0
271266// this #if section corresponds to the first #if section of C++ code
272267// lower in this function.
@@ -277,6 +272,9 @@ struct RedcIncomplete {
277272// the #else section) to make a potential negative value positive during
278273// calculations (done by the instructions that conditionally set moz and then
279274// add moz).
275+
276+ TH u2 = static_cast <TH>(u_hi);
277+ TH u3 = static_cast <TH>(u_hi >> HALF_BITS);
280278 TH m, tmp;
281279 __asm__ (" mul %[m], %[u0], %[invn0] \n\t "
282280 " umulh %[u0], %[m], %[n0] \n\t " /* u0 = mnA_1 */
@@ -344,6 +342,50 @@ struct RedcIncomplete {
344342 subtrahend = (static_cast <T>(tmp) << HALF_BITS) | u1;
345343# endif
346344
345+ /*
346+ #elif (defined(HURCHALLA_ALLOW_INLINE_ASM_ALL) || \
347+ defined(HURCHALLA_ALLOW_INLINE_ASM_REDC)) && \
348+ defined(HURCHALLA_TARGET_ISA_X86_64) && !defined(_MSC_VER)
349+ */
350+ #elif 0 // x86-64 inline-asm variant, currently compiled out; the commented-out condition just above is presumably the intended guard -- confirm
351+ TH m = u0; /* the first imulq below turns m into mA = u0 * invn0 */
352+ TH rrax = n0; /* tied to rax by the "+&a" constraint, so the first mulq multiplies mA by n0 */
353+ TH rrdx, tmp2; /* rrdx is tied to rdx by the "=&d" constraint; tmp2 is a scratch register */
354+ __asm__ (" imulq %[invn0], %[m] \n\t " /* m = mA = u0 * invn0 (m was initialized to u0) */
355+ " mulq %[m] \n\t " /* rdx:rax = mnA_10 = rax * mA (rax == n0); high-order bits of the product in rdx */
356+ " movq %%rdx, %[tmp2] \n\t " /* tmp2 = mnA_1 (high half of n0 * mA) */
357+ " movq %[n1], %%rax \n\t " /* rax = n1, the implicit operand of the next mulq */
358+ " mulq %[m] \n\t " /* rdx:rax = mnA_21 = n1 * mA */
359+ " xorl %k[m], %k[m] \n\t " /* m = 0; it is reused below to accumulate mnA_2 */
360+ " addq %%rax, %[tmp2] \n\t " /* tmp2 = mnA_1 = mnA_1_part2 + mnA_1 */
361+ " adcq %%rdx, %[m] \n\t " /* m = mnA_2 + carry */
362+ " subq %[tmp2], %[u1] \n\t " /* u1 = v1 = u1 - mnA_1 */
363+ " imulq %[u1], %[invn0] \n\t " /* invn0 = mB = v1 * invn0 */
364+
365+ " movq %[n0], %%rax \n\t " /* rax = n0, the implicit operand of the next mulq */
366+ " mulq %[invn0] \n\t " /* rdx:rax = mnB_21 = n0 * mB */
367+ " movq %%rax, %[u1] \n\t " /* u1 = mnB_1 */
368+ " movq %%rdx, %[n0] \n\t " /* n0 = mnB_2 */
369+
370+ " movq %[n1], %%rax \n\t " /* rax = n1, the implicit operand of the next mulq */
371+ " mulq %[invn0] \n\t " /* rdx:rax = mnB_32 = n1 * mB */
372+ " xorl %k[invn0], %k[invn0] \n\t " /* invn0 = 0; it is reused below to accumulate mnB_3 */
373+ " addq %%rax, %[n0] \n\t " /* n0 = mnB_2 = mnB_2_part2 + mnB_2 */
374+ " adcq %%rdx, %[invn0] \n\t " /* invn0 = mnB_3 + carry */
375+
376+ " xorl %%eax, %%eax \n\t " /* rax = 0 */
377+ " addq %[u1], %[tmp2] \n\t " /* tmp2 = dummy = mnA_1 + mnB_1 */
378+ " adcq %[n0], %[m] \n\t " /* m = sum2 = mnB_2 + mnA_2 + carry */
379+ " adcq %%rax, %[invn0], hs \n\t " /* invn0 = sum3 = mnB_3 + carry (rax == 0). NOTE(review): the trailing ", hs" looks like a leftover AArch64 condition code and does not look like valid x86 adc syntax -- confirm/remove before enabling this section */
380+ : [m]" +&r" (m), [invn0]" +&r" (invn0),
381+ " +&a" (rrax), " =&d" (rrdx), [tmp2]" =&r" (tmp2), [n1]" +&r" (n1), [u1]" +&r" (u1),
382+ [n0]" +&r" (n0)
383+ :
384+ : " cc" );
385+ minuend = u_hi;
386+ subtrahend = (static_cast <T>(tmp) << HALF_BITS) | u1; // NOTE(review): this x86 section declares m, rrax, rrdx and tmp2 but no 'tmp'; per the asm comments u1 holds mnB_1 here while sum2/sum3 end up in m/invn0 -- verify the operands before enabling
387+
388+
347389#else // not using inline-asm
348390
349391 TH mA = u0 * invn0;
0 commit comments