Skip to content

Commit 836358e

Browse files
committed
add explanatory comments to MontgomeryDefault.h, and use is_valid_sized_uint inside impl_inverse_mod_R.h
1 parent 634d17a commit 836358e

File tree

4 files changed

+63
-2
lines changed

4 files changed

+63
-2
lines changed

montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/MontgomeryDefault.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,17 @@ class MontgomeryDefault final {
3939
>::type;
4040
};
4141

42+
// Implementation note: when bitsT > target_bits (e.g. T == __int128_t on a 64
43+
// bit system), we purposely never use MontyHalfRange above and instead default
44+
// to MontyFullRange, because MontyFullRange uses unsigned hi_lo mults, whereas
45+
// MontyHalfRange uses signed hi_lo multiplications...
46+
// When bitsT > target_bits we're forced to use a 'slow' hi_lo mult routine,
47+
// since there's no simple asm instruction that's applicable - e.g. on x86_64,
48+
// we need far more than a single MUL or IMUL. And unfortunately we don't have
49+
// a signed routine that's as good as unsigned when bitsT > target_bits. For
50+
// details see the comments for slow_signed_multiply_to_hilo_product() in
51+
// hurchalla/util/detail/platform_specific/impl_signed_multiply_to_hilo_product.h
52+
4253

4354
}} // end namespace
4455

montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/MontyQuarterRange.h

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
#include "hurchalla/montgomery_arithmetic/detail/platform_specific/quarterrange_get_canonical.h"
1313
#include "hurchalla/montgomery_arithmetic/low_level_api/REDC.h"
14+
#include "hurchalla/montgomery_arithmetic/low_level_api/optimization_tag_structs.h"
1415
#include "hurchalla/montgomery_arithmetic/detail/MontyCommonBase.h"
1516
#include "hurchalla/modular_arithmetic/modular_addition.h"
1617
#include "hurchalla/modular_arithmetic/modular_subtraction.h"
@@ -293,12 +294,60 @@ class MontyQuarterRange final : public
293294
HPBC_POSTCONDITION2(0 < sum && sum < static_cast<T>(2*n_));
294295
return V(sum);
295296
}
297+
#if 1
296298
template <class PTAG> HURCHALLA_FORCE_INLINE
297299
V montyREDC(T u_hi, T u_lo, PTAG) const
298300
{
299301
bool resultIsZero; // ignored
300302
return montyREDC(resultIsZero, u_hi, u_lo, PTAG());
301303
}
304+
#else
305+
HURCHALLA_FORCE_INLINE
306+
V montyREDC(bool& resultIsZero, T u_hi, T u_lo, LowlatencyTag) const
307+
{
308+
HPBC_PRECONDITION2(u_hi < n_); // verifies that (u_hi*R + u_lo) < n*R
309+
namespace hc = ::hurchalla;
310+
bool isNegative; // ignored
311+
#if 0
312+
// Enabling this section would result in the same code as the template version
313+
// of this function, above. But we can reduce latency via an optimization
314+
// compilers don't always find, in the #else section.
315+
T result = hc::REDC_incomplete(isNegative, u_hi, u_lo, n_, BC::inv_n_);
316+
resultIsZero = (result == 0);
317+
result = static_cast<T>(result + n_);
318+
#else
319+
u_hi = static_cast<T>(u_hi + n_);
320+
// REDC_incomplete uses u_hi only in a single subtract which sets the
321+
// result, so it makes no difference for correctness in this function if
322+
// we instead perform the addition u_hi + n_ prior to REDC. But
323+
// it will lower latency to do the add before REDC.
324+
T result = hc::REDC_incomplete(isNegative, u_hi, u_lo, n_, BC::inv_n_);
325+
resultIsZero = (result == n_);
326+
#endif
327+
HPBC_POSTCONDITION2(0 < result && result < static_cast<T>(2*n_));
328+
return V(result);
329+
}
330+
template <class PTAG> HURCHALLA_FORCE_INLINE
331+
V montyREDC(T u_hi, T u_lo, PTAG) const
332+
{
333+
HPBC_PRECONDITION2(u_hi < n_); // verifies that (u_hi*R + u_lo) < n*R
334+
namespace hc = ::hurchalla;
335+
bool isNegative; // ignored
336+
#if 0
337+
// This is the obvious code to use, and the #else is an optimization.
338+
// Compilers in theory should find the optimization (latest clang and gcc both
339+
// do), but we enable the optimized version to be certain we get it.
340+
T result = hc::REDC_incomplete(isNegative, u_hi, u_lo, n_, BC::inv_n_);
341+
result = static_cast<T>(result + n_);
342+
#else
343+
u_hi = static_cast<T>(u_hi + n_);
344+
T result = hc::REDC_incomplete(isNegative, u_hi, u_lo, n_, BC::inv_n_);
345+
#endif
346+
HPBC_POSTCONDITION2(0 < result && result < static_cast<T>(2*n_));
347+
return V(result);
348+
}
349+
#endif
350+
302351
// return the high word of the product, and write the low word of the
303352
// product to u_lo.
304353
HURCHALLA_FORCE_INLINE T multiplyToHiLo(T& u_lo, V x, V y) const

montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/low_level_api/detail/impl_inverse_mod_R.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,8 @@ struct impl_inverse_mod_R {
8888
static_assert(ut_numeric_limits<T>::is_integer, "");
8989
static_assert(!(ut_numeric_limits<T>::is_signed), "");
9090
static_assert((bits/2)*2 == bits, "");
91-
using T2 = typename std::conditional<sized_uint<bits/2>::is_valid,
91+
constexpr bool is_valid_su = is_valid_sized_uint<bits/2>::value;
92+
using T2 = typename std::conditional<is_valid_su,
9293
typename sized_uint<bits/2>::type, T>::type;
9394
HPBC_CONSTEXPR_PRECONDITION(a % 2 == 1);
9495

msvc_build_tests.bat

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ REM -G "Visual Studio 16 2019" -A x64
1818
REM -G "Visual Studio 16 2019" -A ARM
1919
REM -G "Visual Studio 16 2019" -A ARM64
2020

21-
cmake -S. -B.\%build_dir% -DTEST_HURCHALLA_LIBS=ON -G "Visual Studio 16 2019" -A x64
21+
cmake -S. -B.\%build_dir% -DTEST_HURCHALLA_LIBS=ON -G "Visual Studio 17 2022" -A x64
2222
if %errorlevel% neq 0 exit /b %errorlevel%
2323
cmake --build .\%build_dir% --config Release
2424
if %errorlevel% neq 0 exit /b %errorlevel%

0 commit comments

Comments
 (0)