add inverse function to MontgomeryForm, and add tests for it.

hurchalla · hurchalla · commit 7215f7b8edea · 2025-09-07T02:00:49.000-07:00
diff --git a/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/MontgomeryForm.h b/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/MontgomeryForm.h
@@ -488,6 +488,7 @@ class MontgomeryForm final {
                 detail::montgomery_array_pow<MontyTag,
                                    MontgomeryForm>::pow(*this, bases, exponent);
         return result[0];
+        //return detail::montgomery_pow<MontgomeryForm>::scalarpow(*this, base, exponent);
     }
 
     // Calculates and returns the modular exponentiation of 2 (converted into a
@@ -498,11 +499,10 @@ class MontgomeryForm final {
     MontgomeryValue two_pow(T exponent) const
     {
         HPBC_CLOCKWORK_API_PRECONDITION(exponent >= 0);
-        MontgomeryValue result =
-                              detail::montgomery_two_pow::call(*this, exponent);
-        HPBC_CLOCKWORK_POSTCONDITION(getCanonicalValue(result) ==
+        MontgomeryValue ret = detail::montgomery_two_pow::call(*this, exponent);
+        HPBC_CLOCKWORK_POSTCONDITION(getCanonicalValue(ret) ==
                                 getCanonicalValue(pow(convertIn(2), exponent)));
-        return result;
+        return ret;
     }
 
     // This is a specially optimized version of the pow() function above.
@@ -536,6 +536,28 @@ class MontgomeryForm final {
     }
 
 
+    // Returns the multiplicative inverse of 'x' in the Montgomery domain if
+    // the inverse exists. If the inverse does not exist, it returns zero (or
+    // more precisely, it returns the value equal to getZeroValue()).
+    // This is a convenience function to stay in the Montgomery domain when you
+    // want to find the multiplicative inverse of a MontgomeryValue.
+    //
+    // Performance note: this function has no performance advantage over
+    // hurchalla::modular_multiplicative_inverse if you need the inverse of a
+    // number in standard integer domain - i.e. don't convert into Montgomery
+    // domain just to call this function. However, when you intend to stay in
+    // the Montgomery domain, this function is the fastest way to get the
+    // multiplicative inverse.
+    template <class PTAG = LowlatencyTag> HURCHALLA_FORCE_INLINE
+    CanonicalValue inverse(MontgomeryValue x) const
+    {
+        CanonicalValue ret = impl.template inverse<PTAG>(x);
+        HPBC_CLOCKWORK_POSTCONDITION(ret == getZeroValue() ||
+                        getCanonicalValue(multiply(x, ret)) == getUnityValue());
+        return ret;
+    }
+
+
     // Returns the "greatest common divisor" of the standard representations
     // (non-montgomery) of both x and the modulus, using the gcd functor that
     // you supply. The functor must take two integral arguments of the same type
diff --git a/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/ImplMontgomeryForm.contents b/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/ImplMontgomeryForm.contents
@@ -228,6 +228,12 @@ public:
         return impl.fusedSquareAdd(x, cv, PTAG());
     }
 
+    template <class PTAG> HURCHALLA_IMF_MAYBE_FORCE_INLINE
+    CanonicalValue inverse(MontgomeryValue x) const
+    {
+        return impl.inverse(x, PTAG());
+    }
+
     template <class F> HURCHALLA_IMF_MAYBE_FORCE_INLINE
     T gcd_with_modulus(MontgomeryValue x, const F& gcd_functor) const
     {
diff --git a/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/MontyCommonBase.h b/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/MontyCommonBase.h
@@ -11,6 +11,8 @@
 
 #include "hurchalla/montgomery_arithmetic/low_level_api/REDC.h"
 #include "hurchalla/modular_arithmetic/detail/optimization_tag_structs.h"
+#include "hurchalla/modular_arithmetic/modular_multiplication.h"
+#include "hurchalla/modular_arithmetic/modular_multiplicative_inverse.h"
 #include "hurchalla/montgomery_arithmetic/low_level_api/get_Rsquared_mod_n.h"
 #include "hurchalla/montgomery_arithmetic/low_level_api/get_R_mod_n.h"
 #include "hurchalla/montgomery_arithmetic/low_level_api/inverse_mod_R.h"
@@ -383,6 +385,38 @@ class MontyCommonBase {
     }
 
 
+    template <class PTAG>   // Performance TAG (see optimization_tag_structs.h)
+    HURCHALLA_FORCE_INLINE C inverse(V x, PTAG) const
+    {
+        // Given x == a*R, we do 2 REDCs to get a*R^(-1), then we call the
+        // standard domain modular_multiplicative_inverse() to get a^(-1)*R.
+
+        namespace hc = ::hurchalla;
+        const D* child = static_cast<const D*>(this);
+        HPBC_CLOCKWORK_PRECONDITION2(child->isValid(x));
+        T u_hi = 0;
+        // get a Natural number (i.e. number >= 0) congruent to x (mod n)
+        T u_lo = static_cast<const D*>(this)->getNaturalEquivalence(x);
+        V result = child->montyREDC(u_hi, u_lo, PTAG());
+
+        u_hi = 0;
+        u_lo = static_cast<const D*>(this)->getNaturalEquivalence(result);
+        V result2 = child->montyREDC(u_hi, u_lo, PTAG());
+
+        T result3 = static_cast<const D*>(this)->getNaturalEquivalence(result2);
+        T gcd;  // ignored
+        T inv = hc::modular_multiplicative_inverse(result3, n_, gcd);
+
+        HPBC_CLOCKWORK_POSTCONDITION2(inv < n_);
+        //POSTCONDITION: Return 0 if the inverse does not exist. Otherwise
+        //   return the value of the inverse (which would never be 0, given that
+        //   the modulus n_ > 1).
+        HPBC_CLOCKWORK_POSTCONDITION2(inv == 0 ||
+           hc::modular_multiplication_prereduced_inputs(result3, inv, n_) == 1);
+        return C(inv);
+    }
+
+
     // Returns the greatest common divisor of the standard representations
     // (non-montgomery) of both x and the modulus, using the supplied functor.
     // The functor must take two integral arguments of the same type and return
diff --git a/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/MontyWrappedStandardMath.h b/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/MontyWrappedStandardMath.h
@@ -13,6 +13,7 @@
 #include "hurchalla/montgomery_arithmetic/low_level_api/get_R_mod_n.h"
 #include "hurchalla/montgomery_arithmetic/detail/MontyTags.h"
 #include "hurchalla/modular_arithmetic/modular_multiplication.h"
+#include "hurchalla/modular_arithmetic/modular_multiplicative_inverse.h"
 #include "hurchalla/modular_arithmetic/modular_addition.h"
 #include "hurchalla/modular_arithmetic/modular_subtraction.h"
 #include "hurchalla/modular_arithmetic/absolute_value_difference.h"
@@ -278,6 +279,22 @@ class MontyWrappedStandardMath final {
         return sv;
     }
 
+    template <class PTAG>   // Performance TAG (see optimization_tag_structs.h)
+    HURCHALLA_FORCE_INLINE C inverse(V x, PTAG) const
+    {
+        namespace hc = ::hurchalla;
+        HPBC_CLOCKWORK_PRECONDITION2(isCanonical(x));
+        T gcd;  // ignored
+        T inv = hc::modular_multiplicative_inverse(x.get(), modulus_, gcd);
+
+        HPBC_CLOCKWORK_POSTCONDITION2(inv < modulus_);
+        //POSTCONDITION: Return 0 if the inverse does not exist. Otherwise
+        //   return the value of the inverse (which would never be 0, given that
+        //   modulus_ > 1).
+        HPBC_CLOCKWORK_POSTCONDITION2(inv == 0 || 1 ==
+            hc::modular_multiplication_prereduced_inputs(inv,x.get(),modulus_));
+        return C(inv);
+    }
 
     // Returns the greatest common divisor of the standard representations
     // (non-montgomery) of both x and the modulus, using the supplied functor.
diff --git a/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/unit_testing_helpers/AbstractMontgomeryForm.h b/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/unit_testing_helpers/AbstractMontgomeryForm.h
@@ -216,6 +216,9 @@ class AbstractMontgomeryForm {
     virtual MontgomeryValue fusedSquareAdd(MontgomeryValue x, CanonicalValue cv,
         bool useLowlatencyTag) const = 0;
 
+    virtual CanonicalValue inverse(MontgomeryValue x,
+        bool useLowlatencyTag) const = 0;
+
     virtual std::vector<MontgomeryValue> vectorPow(
         const std::vector<MontgomeryValue>& bases, IntegerType exponent) const = 0;
 
@@ -301,6 +304,12 @@ class AbstractMontgomeryForm {
         return fusedSquareAdd(x, cv, std::is_same<PTAG, LowlatencyTag>::value);
     }
 
+    template <class PTAG = LowlatencyTag>
+    CanonicalValue inverse(MontgomeryValue x) const
+    {
+        return inverse(x, std::is_same<PTAG, LowlatencyTag>::value);
+    }
+
     template <std::size_t NUM_BASES>
     std::array<MontgomeryValue, NUM_BASES>
     pow(const std::array<MontgomeryValue, NUM_BASES>& bases, IntegerType exponent) const
diff --git a/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/unit_testing_helpers/AbstractMontgomeryWrapper.h b/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/unit_testing_helpers/AbstractMontgomeryWrapper.h
@@ -118,6 +118,10 @@ class AbstractMontgomeryWrapper final {
     MontgomeryValue fusedSquareAdd(MontgomeryValue x, CanonicalValue cv) const
         { return pimpl->template fusedSquareAdd<PTAG>(x, cv); }
 
+    template <class PTAG = LowlatencyTag>
+    CanonicalValue inverse(MontgomeryValue x) const
+        { return pimpl->template inverse<PTAG>(x); }
+
     MontgomeryValue pow(MontgomeryValue base, IntegerType exponent) const
         { return pimpl->pow(base, exponent); }
 
diff --git a/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/unit_testing_helpers/ConcreteMontgomeryForm.h b/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/unit_testing_helpers/ConcreteMontgomeryForm.h
@@ -600,6 +600,20 @@ class ConcreteMontgomeryForm final : public AbstractMontgomeryForm<ut_numeric_li
         return OpenV(static_cast<typename OpenV::OT>(mfv.get()));
     }
 
+    virtual C inverse(V x, bool useLowlatencyTag) const override
+    {
+        OpenMFC mfc;
+        if (useLowlatencyTag) {
+            OpenMFC mfc2(mf.template inverse<LowlatencyTag>(OpenMFV(OpenV(x))));
+            mfc = mfc2;
+        } else {
+            OpenMFC mfc2(mf.template inverse<LowuopsTag>(OpenMFV(OpenV(x))));
+            mfc = mfc2;
+        }
+        // note: mfc.get() might be signed or unsigned; OpenC::OT is unsigned
+        return OpenC(static_cast<typename OpenC::OT>(mfc.get()));
+    }
+
 
     // This class (ConcreteMontgomeryForm) only supports calling vectorPow()
     // with a std::vector that has size equal to one of the sizes given by the
diff --git a/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/platform_specific/montgomery_pow.h b/montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/platform_specific/montgomery_pow.h
@@ -168,14 +168,32 @@ struct montgomery_pow {
     }
     while (exponent > static_cast<T>(1)) {
         exponent = static_cast<T>(exponent >> static_cast<T>(1));
+#if 0
         std::array<V, NUM_BASES> tmp;
+
         Unroll<NUM_BASES>::call([&](std::size_t i) HURCHALLA_INLINE_LAMBDA {
             bases[i] = mf.template square<LowuopsTag>(bases[i]);
             tmp[i] = mf.template multiply<LowuopsTag>(result[i], bases[i]);
         });
         Unroll<NUM_BASES>::call([&](std::size_t i) HURCHALLA_INLINE_LAMBDA {
             result[i].cmov(exponent & static_cast<T>(1), tmp[i]);
         });
+#else
+        // see scalarpow() comments for why this #else section might be
+        // preferable to the #if alternative above.  There's probably little
+        // difference at larger NUM_BASES though, where total uops is the
+        // bottleneck rather than dependency chain length.
+        Unroll<NUM_BASES>::call([&](std::size_t i) HURCHALLA_INLINE_LAMBDA {
+            bases[i] = mf.template square<LowuopsTag>(bases[i]);
+        });
+
+        V mont_one = mf.getUnityValue();
+        Unroll<NUM_BASES>::call([&](std::size_t i) HURCHALLA_INLINE_LAMBDA {
+            V tmp = mont_one;
+            tmp.cmov(exponent & static_cast<T>(1), bases[i]);
+            result[i] = mf.template multiply<LowlatencyTag>(result[i], tmp);
+        });
+#endif
     }
     return result;
   }
@@ -198,12 +216,24 @@ struct montgomery_pow {
     }
     while (exponent > static_cast<T>(1)) {
         exponent = static_cast<T>(exponent >> 1u);
+#if 0
         Unroll<NUM_BASES>::call([&](std::size_t i) HURCHALLA_INLINE_LAMBDA {
             bases[i] = mf.template square<LowuopsTag>(bases[i]);
             V tmp = mf.template multiply<LowuopsTag>(result[i], bases[i]);
             result[i].template
                       cmov<CSelectMaskedTag>(exponent & static_cast<T>(1), tmp);
         });
+#else
+        // the comments in arraypow_cmov() apply equally in this section
+        V mont_one = mf.getUnityValue();
+        Unroll<NUM_BASES>::call([&](std::size_t i) HURCHALLA_INLINE_LAMBDA {
+            bases[i] = mf.template square<LowuopsTag>(bases[i]);
+            V tmp = mont_one;
+            tmp.template
+                 cmov<CSelectMaskedTag>(exponent & static_cast<T>(1), bases[i]);
+            result[i] = mf.template multiply<LowlatencyTag>(result[i], tmp);
+        });
+#endif
     }
     return result;
   }
diff --git a/test/montgomery_arithmetic/test_MontgomeryForm.h b/test/montgomery_arithmetic/test_MontgomeryForm.h
@@ -293,6 +293,40 @@ void test_remainder(const M& mf)
     EXPECT_TRUE(mf.remainder(static_cast<T>(mid+1)) == ((mid+1) % modulus));
 }
 
+template <typename M>
+void test_single_inverse(const M& mf, typename M::IntegerType a)
+{
+    namespace hc = ::hurchalla;
+    using T = typename M::IntegerType;
+    using U = typename hc::extensible_make_unsigned<T>::type;
+
+    U n = static_cast<U>(mf.getModulus());
+    U gcd;  // ignored
+    auto answer = hc::modular_multiplicative_inverse(static_cast<U>(a), n, gcd);
+    U val = static_cast<U>(mf.convertOut(mf.inverse(mf.convertIn(a))));
+    EXPECT_TRUE(val == answer);
+}
+
+template <typename M>
+void test_inverse(const M& mf)
+{
+    using T = typename M::IntegerType;
+    T max = ::hurchalla::ut_numeric_limits<T>::max();
+    T mid = static_cast<T>(max/2);
+    T modulus = mf.getModulus();
+    test_single_inverse(mf, static_cast<T>(0));
+    test_single_inverse(mf, static_cast<T>(1));
+    test_single_inverse(mf, static_cast<T>(2));
+    test_single_inverse(mf, static_cast<T>(max-0));
+    test_single_inverse(mf, static_cast<T>(max-1));
+    test_single_inverse(mf, static_cast<T>(mid-0));
+    test_single_inverse(mf, static_cast<T>(mid-1));
+    test_single_inverse(mf, static_cast<T>(modulus-1));
+    test_single_inverse(mf, static_cast<T>(modulus-2));
+    test_single_inverse(mf, static_cast<T>(modulus/2));
+    test_single_inverse(mf, static_cast<T>((modulus/2) - 1));
+}
+
 template <typename M>
 void test_mf_general_checks(const M& mf, typename M::IntegerType a,
                            typename M::IntegerType b, typename M::IntegerType c)
@@ -739,16 +773,28 @@ void test_MontgomeryForm()
         EXPECT_TRUE(mf.gcd_with_modulus(mf.convertIn(12), GcdFunctor()) == 3);
     }
 
-    // test remainder()
+    // test remainder() and inverse()
     {
         T max = max_modulus;
         T mid = static_cast<T>(max/2);
         mid = (mid % 2 == 0) ? static_cast<T>(mid + 1) : mid;
-        test_remainder(MFactory::construct(3));    // smallest possible modulus
-        test_remainder(MFactory::construct(max));  // largest possible modulus
-        if (121 <= max)
-            test_remainder(MFactory::construct(121));
-        test_remainder(MFactory::construct(mid));
+        auto mf_3 = MFactory::construct(3);
+        test_remainder(mf_3);    // smallest possible modulus
+        test_inverse(mf_3);
+
+        auto mf_max = MFactory::construct(max);
+        test_remainder(mf_max);  // largest possible modulus
+        test_inverse(mf_max);
+
+        if (121 <= max) {
+            auto mf_121 = MFactory::construct(121);
+            test_remainder(mf_121);
+            test_inverse(mf_121);
+        }
+
+        auto mf_mid = MFactory::construct(mid);
+        test_remainder(mf_mid);
+        test_inverse(mf_mid);
     }
 }
 

Original file line number	Diff line number	Diff line change
`@@ -228,6 +228,12 @@ public:`
`228`	`228`	`return impl.fusedSquareAdd(x, cv, PTAG());`
`229`	`229`	`}`
`230`	`230`
	`231`	`+ template <class PTAG> HURCHALLA_IMF_MAYBE_FORCE_INLINE`
	`232`	`+ CanonicalValue inverse(MontgomeryValue x) const`
	`233`	`+ {`
	`234`	`+ return impl.inverse(x, PTAG());`
	`235`	`+ }`
	`236`	`+`
`231`	`237`	`template <class F> HURCHALLA_IMF_MAYBE_FORCE_INLINE`
`232`	`238`	`T gcd_with_modulus(MontgomeryValue x, const F& gcd_functor) const`
`233`	`239`	`{`