|
11 | 11 |
|
12 | 12 | #include "hurchalla/montgomery_arithmetic/detail/platform_specific/quarterrange_get_canonical.h" |
13 | 13 | #include "hurchalla/montgomery_arithmetic/low_level_api/REDC.h" |
| 14 | +#include "hurchalla/montgomery_arithmetic/low_level_api/optimization_tag_structs.h" |
14 | 15 | #include "hurchalla/montgomery_arithmetic/detail/MontyCommonBase.h" |
15 | 16 | #include "hurchalla/modular_arithmetic/modular_addition.h" |
16 | 17 | #include "hurchalla/modular_arithmetic/modular_subtraction.h" |
@@ -293,12 +294,60 @@ class MontyQuarterRange final : public |
293 | 294 | HPBC_POSTCONDITION2(0 < sum && sum < static_cast<T>(2*n_)); |
294 | 295 | return V(sum); |
295 | 296 | } |
| 297 | +#if 1 |
// Overload of montyREDC for callers that do not need the "result is zero"
// flag.  It forwards u_hi and u_lo, together with a default-constructed PTAG,
// to the montyREDC overload whose first parameter is a bool (that overload is
// defined outside this excerpt) and returns that overload's value unchanged.
// This is the variant selected by the enclosing `#if 1`.
296 | 298 | template <class PTAG> HURCHALLA_FORCE_INLINE |
297 | 299 | V montyREDC(T u_hi, T u_lo, PTAG) const |
298 | 300 | { |
299 | 301 | bool resultIsZero; // ignored: passed to the callee but never read afterwards |
300 | 302 | return montyREDC(resultIsZero, u_hi, u_lo, PTAG()); |
301 | 303 | } |
| 304 | +#else |
// LowlatencyTag overload of montyREDC that also reports whether the reduced
// value is zero.
//   resultIsZero : set by this function (see the two variants below).
//   u_hi, u_lo   : high and low words of the value to reduce; u_hi must be
//                  less than n_ (checked by the precondition).
//   returns      : V(result), where the postcondition requires
//                  0 < result < 2*n_.
// NOTE: this definition sits in the #else branch of the enclosing `#if 1`, so
// it is not compiled as the file currently stands.
| 305 | + HURCHALLA_FORCE_INLINE |
| 306 | + V montyREDC(bool& resultIsZero, T u_hi, T u_lo, LowlatencyTag) const |
| 307 | + { |
| 308 | + HPBC_PRECONDITION2(u_hi < n_); // verifies that (u_hi*R + u_lo) < n*R |
| 309 | + namespace hc = ::hurchalla; |
| 310 | + bool isNegative; // ignored: handed to REDC_incomplete, never read here |
| 311 | +#if 0 |
| 312 | +// Enabling this section would result in the same code as the template version |
| 313 | +// of this function, above. But we can reduce latency via an optimization |
| 314 | +// compilers don't always find, in the #else section. |
// Reference ordering: reduce first, test the raw result against 0, then add n_.
| 315 | + T result = hc::REDC_incomplete(isNegative, u_hi, u_lo, n_, BC::inv_n_); |
| 316 | + resultIsZero = (result == 0); |
| 317 | + result = static_cast<T>(result + n_); |
| 318 | +#else |
// Active ordering: add n_ to u_hi before the reduction instead of adding it to
// the result afterwards.
| 319 | + u_hi = static_cast<T>(u_hi + n_); |
| 320 | + // REDC_incomplete uses u_hi only in a single subtract which sets the |
| 321 | + // result, so it makes no difference for correctness in this function if |
| 322 | + // we move the addition of n_ + u_hi to instead be prior to REDC. But |
| 323 | + // it will lower latency to do the add before REDC. |
| 324 | + T result = hc::REDC_incomplete(isNegative, u_hi, u_lo, n_, BC::inv_n_); |
// n_ has already been folded into result here, so the value that the reference
// ordering compares against 0 corresponds to result == n_ in this ordering.
| 325 | + resultIsZero = (result == n_); |
| 326 | +#endif |
| 327 | + HPBC_POSTCONDITION2(0 < result && result < static_cast<T>(2*n_)); |
| 328 | + return V(result); |
| 329 | + } |
// Stand-alone montyREDC for any PTAG that does not report a "result is zero"
// flag.  It calls REDC_incomplete directly rather than forwarding to another
// montyREDC overload.
//   u_hi, u_lo : high and low words of the value to reduce; u_hi must be less
//                than n_ (checked by the precondition).
//   returns    : V(result), where the postcondition requires
//                0 < result < 2*n_.
// NOTE: this definition sits in the #else branch of the enclosing `#if 1`, so
// it is not compiled as the file currently stands.
| 330 | + template <class PTAG> HURCHALLA_FORCE_INLINE |
| 331 | + V montyREDC(T u_hi, T u_lo, PTAG) const |
| 332 | + { |
| 333 | + HPBC_PRECONDITION2(u_hi < n_); // verifies that (u_hi*R + u_lo) < n*R |
| 334 | + namespace hc = ::hurchalla; |
| 335 | + bool isNegative; // ignored: handed to REDC_incomplete, never read here |
| 336 | +#if 0 |
| 337 | +// This is the obvious code to use, and the #else is an optimization. |
| 338 | +// Compilers in theory should find the optimization (latest clang and gcc both |
| 339 | +// do), but we enable the optimized version to be certain we get it. |
| 340 | + T result = hc::REDC_incomplete(isNegative, u_hi, u_lo, n_, BC::inv_n_); |
| 341 | + result = static_cast<T>(result + n_); |
| 342 | +#else |
// Active ordering: n_ is added to u_hi before the reduction, replacing the
// addition of n_ to the result that the #if 0 variant performs afterwards.
| 343 | + u_hi = static_cast<T>(u_hi + n_); |
| 344 | + T result = hc::REDC_incomplete(isNegative, u_hi, u_lo, n_, BC::inv_n_); |
| 345 | +#endif |
| 346 | + HPBC_POSTCONDITION2(0 < result && result < static_cast<T>(2*n_)); |
| 347 | + return V(result); |
| 348 | + } |
| 349 | +#endif |
| 350 | + |
302 | 351 | // return the high word of the product, and write the low word of the |
303 | 352 | // product to u_lo. |
304 | 353 | HURCHALLA_FORCE_INLINE T multiplyToHiLo(T& u_lo, V x, V y) const |
|
0 commit comments