add explanatory comments to MontgomeryDefault.h, and use is_valid_sized_uint inside impl_inverse_mod_R.h

hurchalla · hurchalla · commit 836358e45071 · 2025-01-28T14:27:29.000-08:00
diff --git a/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/MontgomeryDefault.h b/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/MontgomeryDefault.h
@@ -39,6 +39,17 @@ class MontgomeryDefault final {
                  >::type;
 };
 
+// Implementation note: when bitsT > target_bits (e.g. T == __int128_t on a 64
+// bit system), we purposely never use MontyHalfRange above and instead default
+// to MontyFullRange, because MontyFullRange uses unsigned hi_lo multiplies,
+// whereas MontyHalfRange uses signed hi_lo multiplies.
+// When bitsT > target_bits we're forced to use a 'slow' hi_lo multiply routine,
+// since no single asm instruction is applicable - e.g. on x86_64 we need far
+// more than a single MUL or IMUL.  And unfortunately we don't have a signed
+// routine that's as good as the unsigned one when bitsT > target_bits.  For
+// details see the comments for slow_signed_multiply_to_hilo_product() in
+// hurchalla/util/detail/platform_specific/impl_signed_multiply_to_hilo_product.h
+
 
 }} // end namespace
 
diff --git a/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/MontyQuarterRange.h b/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/MontyQuarterRange.h
@@ -11,6 +11,7 @@
 
 #include "hurchalla/montgomery_arithmetic/detail/platform_specific/quarterrange_get_canonical.h"
 #include "hurchalla/montgomery_arithmetic/low_level_api/REDC.h"
+#include "hurchalla/montgomery_arithmetic/low_level_api/optimization_tag_structs.h"
 #include "hurchalla/montgomery_arithmetic/detail/MontyCommonBase.h"
 #include "hurchalla/modular_arithmetic/modular_addition.h"
 #include "hurchalla/modular_arithmetic/modular_subtraction.h"
@@ -293,12 +294,60 @@ class MontyQuarterRange final : public
         HPBC_POSTCONDITION2(0 < sum && sum < static_cast<T>(2*n_));
         return V(sum);
     }
+#if 1
     template <class PTAG> HURCHALLA_FORCE_INLINE
     V montyREDC(T u_hi, T u_lo, PTAG) const
     {
         bool resultIsZero;  // ignored
         return montyREDC(resultIsZero, u_hi, u_lo, PTAG());
     }
+#else
+    HURCHALLA_FORCE_INLINE
+    V montyREDC(bool& resultIsZero, T u_hi, T u_lo, LowlatencyTag) const
+    {
+        HPBC_PRECONDITION2(u_hi < n_);  // verifies that (u_hi*R + u_lo) < n*R
+        namespace hc = ::hurchalla;
+        bool isNegative;  // ignored
+#if 0
+// Enabling this section would result in the same code as the template version
+// of this function, above.  But we can reduce latency via an optimization
+// compilers don't always find, in the #else section.
+        T result = hc::REDC_incomplete(isNegative, u_hi, u_lo, n_, BC::inv_n_);
+        resultIsZero = (result == 0);
+        result = static_cast<T>(result + n_);
+#else
+        u_hi = static_cast<T>(u_hi + n_);
+        // REDC_incomplete uses u_hi only in a single subtract which sets the
+        // result, so for correctness it makes no difference in this function
+        // if we add n_ to u_hi before the REDC rather than adding n_ to the
+        // result afterward.  But doing the add before the REDC lowers latency.
+        T result = hc::REDC_incomplete(isNegative, u_hi, u_lo, n_, BC::inv_n_);
+        resultIsZero = (result == n_);
+#endif
+        HPBC_POSTCONDITION2(0 < result && result < static_cast<T>(2*n_));
+        return V(result);
+    }
+    template <class PTAG> HURCHALLA_FORCE_INLINE
+    V montyREDC(T u_hi, T u_lo, PTAG) const
+    {
+        HPBC_PRECONDITION2(u_hi < n_);  // verifies that (u_hi*R + u_lo) < n*R
+        namespace hc = ::hurchalla;
+        bool isNegative;  // ignored
+#if 0
+// This is the obvious code to use, and the #else is an optimization.
+// Compilers in theory should find the optimization (latest clang and gcc both
+// do), but we enable the optimized version to be certain we get it.
+        T result = hc::REDC_incomplete(isNegative, u_hi, u_lo, n_, BC::inv_n_);
+        result = static_cast<T>(result + n_);
+#else
+        u_hi = static_cast<T>(u_hi + n_);
+        T result = hc::REDC_incomplete(isNegative, u_hi, u_lo, n_, BC::inv_n_);
+#endif
+        HPBC_POSTCONDITION2(0 < result && result < static_cast<T>(2*n_));
+        return V(result);
+    }
+#endif
+
     // return the high word of the product, and write the low word of the
     // product to u_lo.
     HURCHALLA_FORCE_INLINE T multiplyToHiLo(T& u_lo, V x, V y) const
diff --git a/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/low_level_api/detail/impl_inverse_mod_R.h b/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/low_level_api/detail/impl_inverse_mod_R.h
@@ -88,7 +88,8 @@ struct impl_inverse_mod_R {
         static_assert(ut_numeric_limits<T>::is_integer, "");
         static_assert(!(ut_numeric_limits<T>::is_signed), "");
         static_assert((bits/2)*2 == bits, "");
-        using T2 = typename std::conditional<sized_uint<bits/2>::is_valid,
+        constexpr bool is_valid_su = is_valid_sized_uint<bits/2>::value;
+        using T2 = typename std::conditional<is_valid_su,
                                     typename sized_uint<bits/2>::type, T>::type;
         HPBC_CONSTEXPR_PRECONDITION(a % 2 == 1);
 
diff --git a/msvc_build_tests.bat b/msvc_build_tests.bat
@@ -18,7 +18,7 @@ REM -G "Visual Studio 16 2019" -A x64
 REM -G "Visual Studio 16 2019" -A ARM
 REM -G "Visual Studio 16 2019" -A ARM64
 
-cmake -S. -B.\%build_dir% -DTEST_HURCHALLA_LIBS=ON -G "Visual Studio 16 2019" -A x64
+cmake -S. -B.\%build_dir% -DTEST_HURCHALLA_LIBS=ON -G "Visual Studio 17 2022" -A x64
 if %errorlevel% neq 0 exit /b %errorlevel%
 cmake --build .\%build_dir% --config Release
 if %errorlevel% neq 0 exit /b %errorlevel%