@@ -192,17 +192,28 @@ struct DefaultRedcStandard
192192 static HURCHALLA_FORCE_INLINE
193193 T call (T u_hi, T u_lo, T n, T inv_n)
194194 {
195+ #if defined(HURCHALLA_AVOID_CSELECT)
195196 bool ovf;
196197 T result = RedcIncomplete::call (ovf, u_hi, u_lo, n, inv_n);
197198 // By RedcIncomplete::call()'s Postcondition #1, we get
198199 // T final_result = (ovf) ? static_cast<T>(result + n) : result;
199- #if defined(HURCHALLA_AVOID_CSELECT)
200200 T mask = static_cast <T>(-static_cast <T>(ovf));
201201 T final_result = static_cast <T>(result + (mask & n));
202202#else
203+ # if 0
204+ bool ovf;
205+ T result = RedcIncomplete::call (ovf, u_hi, u_lo, n, inv_n);
203206 // final_result = (ovf) ? static_cast<T>(result+n) : result
204207 T final_result = ::hurchalla::conditional_select (
205208 ovf, static_cast <T>(result+n), result);
209+ # else
210+ using P = typename safely_promote_unsigned<T>::type;
211+ T m = static_cast <T>(static_cast <P>(u_lo) * static_cast <P>(inv_n));
212+ T mn_lo;
213+ T mn_hi = ::hurchalla::unsigned_multiply_to_hilo_product (mn_lo, m, n);
214+ T final_result = ::hurchalla::modular_subtraction_prereduced_inputs
215+ <T,::hurchalla::LowuopsTag>(u_hi, mn_hi, n);
216+ # endif
206217#endif
207218 HPBC_POSTCONDITION2 (final_result < n);
208219 return final_result;
@@ -232,6 +243,65 @@ struct RedcStandard
232243#if (defined(HURCHALLA_ALLOW_INLINE_ASM_ALL) || \
233244 defined (HURCHALLA_ALLOW_INLINE_ASM_REDC)) && \
234245 defined(HURCHALLA_TARGET_ISA_X86_64) && !defined(_MSC_VER)
246+
247+ # if (HURCHALLA_COMPILER_HAS_UINT128_T())
248+ // specialization for __uint128_t (for x86_64)
// REDC for a 128-bit unsigned T (presumably Montgomery reduction -- confirm
// against the primary RedcStandard template's documentation).
// The unnamed tag parameter of each call() overload is used only to select
// the overload:
//   LowlatencyTag -> subtract / conditional-move sequence in x86-64 inline asm
//   LowuopsTag    -> forwards to DefaultRedcStandard<T>::call()
249+ template <>
250+ struct RedcStandard <__uint128_t >
251+ {
252+ using T = __uint128_t ;
253+
// Low-latency overload.
// Parameters:
//   u_hi, u_lo - presumably the high and low T-sized halves of the value to
//                reduce (TODO confirm); u_hi must be < n.
//   n          - must be odd and > 1.
//   inv_n      - must satisfy n * inv_n == 1 after truncation to T.
// Returns (u_hi >= mn_hi) ? u_hi - mn_hi : u_hi + n - mn_hi, where mn_hi is
// the value returned by unsigned_multiply_to_hilo_product(mn_lo, m, n) and
// m is u_lo * inv_n truncated to T.  The result is checked to be < n and to
// equal DefaultRedcStandard<T>::call(u_hi, u_lo, n, inv_n).
254+ static HURCHALLA_FORCE_INLINE
255+ T call (T u_hi, T u_lo, T n, T inv_n, LowlatencyTag)
256+ {
// The multiplications below are carried out in type P and the product is
// truncated back to T.
257+ using P = typename safely_promote_unsigned<T>::type;
258+ // see uint64_t version's comments for explanations
259+ HPBC_PRECONDITION2 (u_hi < n);
260+ HPBC_PRECONDITION2 (
261+ static_cast <T>(static_cast <P>(n) * static_cast <P>(inv_n)) == 1 );
262+ HPBC_PRECONDITION2 (n % 2 == 1 );
263+ HPBC_PRECONDITION2 (n > 1 );
264+
265+ T m = static_cast <T>(static_cast <P>(u_lo) * static_cast <P>(inv_n));
// mn_lo is passed uninitialized and is not read again in this function;
// presumably it receives the low half of the product m*n while the high half
// is returned (TODO confirm against unsigned_multiply_to_hilo_product).
266+ T mn_lo;
267+ T mn_hi = ::hurchalla::unsigned_multiply_to_hilo_product (mn_lo, m, n);
268+ HPBC_ASSERT2 (mn_hi < n);
// Unsigned addition in T: any carry out of the 128 bits is discarded.
269+ T reg = u_hi + n;
270+
// Split each 128-bit value into two 64-bit halves for the 64-bit (q-suffixed)
// instructions below.  uhilo/uhihi are copies of u_hi's halves: the asm
// overwrites them, and u_hi itself is still needed for the assertion after
// the asm.
271+ using std::uint64_t ;
272+ uint64_t reglo = static_cast <uint64_t >(reg);
273+ uint64_t reghi = static_cast <uint64_t >(reg >> 64 );
274+ uint64_t uhilo = static_cast <uint64_t >(u_hi);
275+ uint64_t uhihi = static_cast <uint64_t >(u_hi >> 64 );
276+ uint64_t mnhilo = static_cast <uint64_t >(mn_hi);
277+ uint64_t mnhihi = static_cast <uint64_t >(mn_hi >> 64 );
// Instruction order matters here:
//   1st sub/sbb pair: reg  -= mn_hi  (128-bit subtract, borrow propagated
//                     from the low half to the high half by sbb).
//   2nd sub/sbb pair: u_hi copy -= mn_hi; it leaves the carry flag set
//                     exactly when u_hi < mn_hi (unsigned borrow).  It must
//                     be the last flag-setting work before the cmovs.
//   cmovae pair:      when the carry flag is clear (u_hi >= mn_hi), replace
//                     reg with the 2nd pair's result.  cmov does not modify
//                     flags, so both cmovs test the same condition.
// Operand constraints: the four outputs are read-write, early-clobber
// register operands; the two mn_hi halves are register inputs; the "cc"
// clobber tells the compiler the condition flags are modified.
278+ __asm__ (
279+ " subq %[mnhilo], %[reglo] \n\t " /* reg = u_hi + n - mn_hi */
280+ " sbbq %[mnhihi], %[reghi] \n\t "
281+ " subq %[mnhilo], %[uhilo] \n\t " /* t_hi = u_hi - mn_hi */
282+ " sbbq %[mnhihi], %[uhihi] \n\t "
283+ " cmovaeq %[uhilo], %[reglo] \n\t " /* reg = (u_hi >= mn_hi) ? t_hi : reg */
284+ " cmovaeq %[uhihi], %[reghi] \n\t "
285+ : [reglo]" +&r" (reglo), [reghi]" +&r" (reghi), [uhilo]" +&r" (uhilo), [uhihi]" +&r" (uhihi)
286+ : [mnhilo]" r" (mnhilo), [mnhihi]" r" (mnhihi)
287+ : " cc" );
// Reassemble the 128-bit result from the two 64-bit halves.
288+ T result = (static_cast <__uint128_t >(reghi) << 64 ) | reglo;
// Cross-check the asm path against the generic implementation.
289+ HPBC_ASSERT2 (result == DefaultRedcStandard<T>::call (u_hi, u_lo, n, inv_n));
290+ HPBC_POSTCONDITION2 (result < n);
291+ return result;
292+ }
293+
// Low-uops overload: no inline asm; forwards to the generic
// DefaultRedcStandard<T>::call() and checks only that the result is < n.
294+ static HURCHALLA_FORCE_INLINE
295+ T call (T u_hi, T u_lo, T n, T inv_n, LowuopsTag)
296+ {
297+ T result = DefaultRedcStandard<T>::call (u_hi, u_lo, n, inv_n);
298+ HPBC_POSTCONDITION2 (result < n);
299+ return result;
300+ }
301+ };
302+ # endif
303+
304+
235305// specialization for uint64_t (for x86_64)
236306template <>
237307struct RedcStandard <std::uint64_t >
@@ -286,6 +356,52 @@ struct RedcStandard<std::uint64_t>
286356 return result;
287357 }
288358};
359+
360+
361+ // specialization for uint32_t
// REDC for a 32-bit unsigned T (presumably Montgomery reduction -- confirm
// against the primary RedcStandard template's documentation).  The inline asm
// below uses x86 instructions (subl / cmovael).
// The unnamed tag parameter of each call() overload is used only to select
// the overload:
//   LowlatencyTag -> subtract / conditional-move sequence in inline asm
//   LowuopsTag    -> forwards to DefaultRedcStandard<T>::call()
362+ template <>
363+ struct RedcStandard <std::uint32_t >
364+ {
365+ using T = std::uint32_t ;
366+
// Low-latency overload.
// Parameters:
//   u_hi, u_lo - presumably the high and low T-sized halves of the value to
//                reduce (TODO confirm); u_hi must be < n.
//   n          - must be odd and > 1.
//   inv_n      - must satisfy n * inv_n == 1 after truncation to T.
// Returns (u_hi >= mn_hi) ? u_hi - mn_hi : u_hi + n - mn_hi, where mn_hi is
// the value returned by unsigned_multiply_to_hilo_product(mn_lo, m, n) and
// m is u_lo * inv_n truncated to T.  The result is checked to be < n and to
// equal DefaultRedcStandard<T>::call(u_hi, u_lo, n, inv_n).
367+ static HURCHALLA_FORCE_INLINE
368+ T call (T u_hi, T u_lo, T n, T inv_n, LowlatencyTag)
369+ {
// The multiplications below are carried out in type P and the product is
// truncated back to T.
370+ using P = typename safely_promote_unsigned<T>::type;
371+ // see uint64_t version's comments for explanations
372+ HPBC_PRECONDITION2 (u_hi < n);
373+ HPBC_PRECONDITION2 (
374+ static_cast <T>(static_cast <P>(n) * static_cast <P>(inv_n)) == 1 );
375+ HPBC_PRECONDITION2 (n % 2 == 1 );
376+ HPBC_PRECONDITION2 (n > 1 );
377+
378+ T m = static_cast <T>(static_cast <P>(u_lo) * static_cast <P>(inv_n));
// mn_lo is passed uninitialized and is not read again in this function;
// presumably it receives the low half of the product m*n while the high half
// is returned (TODO confirm against unsigned_multiply_to_hilo_product).
379+ T mn_lo;
380+ T mn_hi = ::hurchalla::unsigned_multiply_to_hilo_product (mn_lo, m, n);
381+ HPBC_ASSERT2 (mn_hi < n);
// The sum is stored in a 32-bit T, so any carry out of 32 bits is discarded.
382+ T reg = u_hi + n;
// Working copy of u_hi: the asm overwrites it (read-write operand), and u_hi
// itself is still needed for the assertion after the asm.
383+ T uhi = u_hi;
// Instruction order matters here:
//   1st subl: reg -= mn_hi.
//   2nd subl: uhi -= mn_hi; it leaves the carry flag set exactly when
//             u_hi < mn_hi (unsigned borrow).  It must be the last
//             flag-setting instruction before the cmov.
//   cmovael:  when the carry flag is clear (u_hi >= mn_hi), replace reg with
//             the 2nd subtraction's result.
// Operand constraints: reg and uhi are read-write, early-clobber register
// operands; mn_hi is a register input; the "cc" clobber tells the compiler
// the condition flags are modified.
384+ __asm__ (
385+ " subl %[mnhi], %[reg] \n\t " /* reg = u_hi + n - mn_hi */
386+ " subl %[mnhi], %[uhi] \n\t " /* t_hi = u_hi - mn_hi */
387+ " cmovael %[uhi], %[reg] \n\t " /* reg = (u_hi >= mn_hi) ? t_hi : reg */
388+ : [reg]" +&r" (reg), [uhi]" +&r" (uhi)
389+ : [mnhi]" r" (mn_hi)
390+ : " cc" );
391+ T result = reg;
// Cross-check the asm path against the generic implementation.
392+ HPBC_ASSERT2 (result == DefaultRedcStandard<T>::call (u_hi, u_lo, n, inv_n));
393+ HPBC_POSTCONDITION2 (result < n);
394+ return result;
395+ }
396+
// Low-uops overload: no inline asm; forwards to the generic
// DefaultRedcStandard<T>::call() and checks only that the result is < n.
397+ static HURCHALLA_FORCE_INLINE
398+ T call (T u_hi, T u_lo, T n, T inv_n, LowuopsTag)
399+ {
400+ T result = DefaultRedcStandard<T>::call (u_hi, u_lo, n, inv_n);
401+ HPBC_POSTCONDITION2 (result < n);
402+ return result;
403+ }
404+ };
289405#endif // (defined(HURCHALLA_ALLOW_INLINE_ASM_ALL) ||
290406 // defined(HURCHALLA_ALLOW_INLINE_ASM_REDC)) &&
291407 // defined(HURCHALLA_TARGET_ISA_X86_64) && !defined(_MSC_VER)
0 commit comments