hurchalla
diff --git a/‎build_tests.sh‎
Lines changed: 4 additions & 4 deletions b/‎build_tests.sh‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎modular_arithmetic/include/hurchalla/modular_arithmetic/detail/platform_specific/impl_modular_addition.h‎
Lines changed: 24 additions & 166 deletions b/‎modular_arithmetic/include/hurchalla/modular_arithmetic/detail/platform_specific/impl_modular_addition.h‎
Lines changed: 24 additions & 166 deletions
diff --git a/‎modular_arithmetic/include/hurchalla/modular_arithmetic/detail/platform_specific/impl_modular_multiplication.h‎
Lines changed: 2 additions & 4 deletions b/‎modular_arithmetic/include/hurchalla/modular_arithmetic/detail/platform_specific/impl_modular_multiplication.h‎
Lines changed: 2 additions & 4 deletions
@@ -571,10 +571,10 @@ fi
 
 
 if [ "$run_tests" = true ]; then
-  ./$build_dir/test_ndebug_programming_by_contract --gtest_break_on_failure
-  exit_on_failure
-  ./$build_dir/test_programming_by_contract --gtest_break_on_failure
-  exit_on_failure
+#  ./$build_dir/test_ndebug_programming_by_contract --gtest_break_on_failure
+#  exit_on_failure
+#  ./$build_dir/test_programming_by_contract --gtest_break_on_failure
+#  exit_on_failure
   ./$build_dir/test_hurchalla_util --gtest_break_on_failure
   exit_on_failure
   ./$build_dir/test_hurchalla_modular_arithmetic --gtest_break_on_failure
 
@@ -9,7 +9,6 @@
 #define HURCHALLA_MODULAR_ARITHMETIC_IMPL_MODULAR_ADDITION_H_INCLUDED
 
 
-#include "hurchalla/modular_arithmetic/detail/optimization_tag_structs.h"
 #include "hurchalla/util/traits/extensible_make_unsigned.h"
 #include "hurchalla/util/traits/ut_numeric_limits.h"
 #include "hurchalla/util/conditional_select.h"
@@ -21,52 +20,17 @@
 namespace hurchalla { namespace detail {
 
 
-// With regard to the LowlatencyTag vs. the LowuopsTag functions:
-// If neither 'b' nor 'modulus' was set/modified recently before the call of
-// this modular addition function, then the LowlatencyTag versions will likely
-// provide lower latency than the LowuopsTag versions.  Note that LowlatencyTag
-// will typically use more uops and create more pressure on the ALU than
-// LowuopsTag, unless the compiler can loop hoist the extra instruction(s)
-// involving 'b' and 'modulus'.
-
 // Fyi: the purpose of having structs with static member functions is to
 // disallow ADL and to make specializations simple and easy.
 
-// primary template for default implementation
-template <class PTAG>
-struct default_impl_modadd_unsigned {
-};
-
-// --- Version #0 (for low uops and low ALU use) ---
-template <>
-struct default_impl_modadd_unsigned<LowuopsTag> {
-  template <typename T>
-  HURCHALLA_FORCE_INLINE static T call(T a, T b, T modulus)
-  {
-    static_assert(ut_numeric_limits<T>::is_integer, "");
-    static_assert(!(ut_numeric_limits<T>::is_signed), "");
-    HPBC_PRECONDITION2(modulus>0);
-    HPBC_PRECONDITION2(a<modulus);  // i.e. the input must be prereduced
-    HPBC_PRECONDITION2(b<modulus);  // i.e. the input must be prereduced
-
-    T sum = static_cast<T>(a + b);
-    T result = static_cast<T>(sum - modulus);
-      // result = (sum < modulus) ? sum : result;
-    result = ::hurchalla::conditional_select(sum < modulus, sum, result);
-
-    HPBC_POSTCONDITION2(static_cast<T>(0) <= result && result < modulus);
-    return result;
-  }
-};
-
 #if !defined(__clang__)
 // Note: only clang (on x64 and arm32/64) seems to produce optimal code from
 // the theoretically preferable Version 2 further below.  And so for gcc(x64 and
 // arm32/64 and risc-V), MSVC (x64 and arm64), and icc, Version #1 here tends to
 // compile to better machine code in practice.  Gcc in particular tends to
 // generate conditional branches for Version 2, which we don't want.
 
-// --- LowlatencyTag Version #1 ---
+// --- Version #1 ---
 // This is a relatively straightforward and easy to understand version.
 //
 // However, on x86, Version #1's use of (modulus - b) will often require two
@@ -85,8 +49,7 @@ struct default_impl_modadd_unsigned<LowuopsTag> {
 // between the two function versions.  We would generally expect the latency of
 // the two versions to be the same, but as always this depends on whether the
 // compiler generates good (or not good) machine code.
-template <>
-struct default_impl_modadd_unsigned<LowlatencyTag> {
+struct default_impl_modadd_unsigned {
   template <typename T>
   HURCHALLA_FORCE_INLINE static T call(T a, T b, T modulus)
   {
@@ -113,13 +76,12 @@ struct default_impl_modadd_unsigned<LowlatencyTag> {
 };
 #else
 
-// --- LowlatencyTag Version #2 ---
+// --- Version #2 ---
 // This is a more difficult to understand default implementation version.  The
 // proof of this function's correctness is given by the theorem in the comments
 // at the end of this file.  See the notes at the end of those comments to
 // understand the implementation details.
-template <>
-struct default_impl_modadd_unsigned<LowlatencyTag> {
+struct default_impl_modadd_unsigned {
   template <typename T>
   HURCHALLA_FORCE_INLINE static T call(T a, T b, T modulus)
   {
@@ -145,17 +107,17 @@ struct default_impl_modadd_unsigned<LowlatencyTag> {
 
 
 // primary template
-template <typename T, class PTAG>
+template <typename T>
 struct impl_modular_addition_unsigned {
   HURCHALLA_FORCE_INLINE static T call(T a, T b, T modulus)
   {
-    return default_impl_modadd_unsigned<PTAG>::call(a, b, modulus);
+    return default_impl_modadd_unsigned::call(a, b, modulus);
   }
 };
 
 
 // These inline asm functions implement optimizations of the default function
-// versions #0 and #2 (above), for LowuopsTag and LowlatencyTag respectively.
+// version #2 (above).
 
 // MSVC doesn't support inline asm, so we skip it.
 
@@ -164,13 +126,13 @@ struct impl_modular_addition_unsigned {
     defined(HURCHALLA_TARGET_ISA_X86_64) && !defined(_MSC_VER)
 
 
-// These LowlatencyTag functions contain the calculation "b - modulus".  If
-// neither 'b' nor 'modulus' was recently set/modified, then "b - modulus"
-// will usually be calculated at the same time as earlier work by the CPU, or
-// in a loop it could potentially be loop hoisted by the compiler.  Either way,
-// this potentially allows lower latency than the LowuopsTag version.
+// Note: these functions contain the calculation "b - modulus".  If neither 'b'
+// nor 'modulus' was recently set/modified, then "b - modulus" will usually be
+// calculated at the same time as earlier work by the CPU, or in a loop it could
+// potentially be loop hoisted by the compiler.  This is what provides a
+// potential for lowered latency.
 template <>
-struct impl_modular_addition_unsigned<std::uint32_t, LowlatencyTag> {
+struct impl_modular_addition_unsigned<std::uint32_t> {
   HURCHALLA_FORCE_INLINE static
   std::uint32_t call(std::uint32_t a, std::uint32_t b, std::uint32_t modulus)
   {
@@ -201,13 +163,13 @@ struct impl_modular_addition_unsigned<std::uint32_t, LowlatencyTag> {
 
     HPBC_POSTCONDITION2(result < modulus);  // uint32_t guarantees result>=0.
     HPBC_POSTCONDITION2(result ==
-              default_impl_modadd_unsigned<LowlatencyTag>::call(a, b, modulus));
+                             default_impl_modadd_unsigned::call(a, b, modulus));
     return result;
   }
 };
 
 template <>
-struct impl_modular_addition_unsigned<std::uint64_t, LowlatencyTag> {
+struct impl_modular_addition_unsigned<std::uint64_t> {
   HURCHALLA_FORCE_INLINE static
   std::uint64_t call(std::uint64_t a, std::uint64_t b, std::uint64_t modulus)
   {
@@ -232,14 +194,14 @@ struct impl_modular_addition_unsigned<std::uint64_t, LowlatencyTag> {
 
     HPBC_POSTCONDITION2(result < modulus);  // uint64_t guarantees result>=0.
     HPBC_POSTCONDITION2(result ==
-              default_impl_modadd_unsigned<LowlatencyTag>::call(a, b, modulus));
+                             default_impl_modadd_unsigned::call(a, b, modulus));
     return result;
   }
 };
 
 #ifdef HURCHALLA_ENABLE_INLINE_ASM_128_BIT
 template <>
-struct impl_modular_addition_unsigned<__uint128_t, LowlatencyTag> {
+struct impl_modular_addition_unsigned<__uint128_t> {
   HURCHALLA_FORCE_INLINE static
   __uint128_t call(__uint128_t a, __uint128_t b, __uint128_t modulus)
   {
@@ -271,109 +233,7 @@ struct impl_modular_addition_unsigned<__uint128_t, LowlatencyTag> {
 
     HPBC_POSTCONDITION2(result < modulus);  // __uint128_t guarantees result>=0.
     HPBC_POSTCONDITION2(result ==
-              default_impl_modadd_unsigned<LowlatencyTag>::call(a, b, modulus));
-    return result;
-  }
-};
-#endif
-
-
-// These LowuopsTag versions should have the lowest ALU use (one add, one sub),
-// and often the lowest uop count.
-template <>
-struct impl_modular_addition_unsigned<std::uint32_t, LowuopsTag> {
-  HURCHALLA_FORCE_INLINE static
-  std::uint32_t call(std::uint32_t a, std::uint32_t b, std::uint32_t modulus)
-  {
-    using std::uint32_t;
-    HPBC_PRECONDITION2(modulus>0);
-    HPBC_PRECONDITION2(a<modulus);  // uint32_t guarantees a>=0.
-    HPBC_PRECONDITION2(b<modulus);  // uint32_t guarantees b>=0.
-
-    uint32_t sum = static_cast<uint32_t>(a + b);
-    uint32_t tmp = sum;
-    __asm__ ("subl %[m], %[sum] \n\t"         /* tmp2 = sum - m */
-             "cmovbl %[tmp], %[sum] \n\t"     /* sum = (sum<m) ? tmp : tmp2 */
-             : [sum]"+&r"(sum)
-# if defined(__clang__)        /* https://bugs.llvm.org/show_bug.cgi?id=20197 */
-             : [m]"r"(modulus), [tmp]"r"(tmp)
-# else
-             : [m]"rm"(modulus), [tmp]"rm"(tmp)
-# endif
-             : "cc");
-    uint32_t result = sum;
-
-    HPBC_POSTCONDITION2(result < modulus);  // uint32_t guarantees result>=0.
-    HPBC_POSTCONDITION2(result ==
-                 default_impl_modadd_unsigned<LowuopsTag>::call(a, b, modulus));
-    return result;
-  }
-};
-
-template <>
-struct impl_modular_addition_unsigned<std::uint64_t, LowuopsTag> {
-  HURCHALLA_FORCE_INLINE static
-  std::uint64_t call(std::uint64_t a, std::uint64_t b, std::uint64_t modulus)
-  {
-    using std::uint64_t;
-    HPBC_PRECONDITION2(modulus>0);
-    HPBC_PRECONDITION2(a<modulus);  // uint64_t guarantees a>=0.
-    HPBC_PRECONDITION2(b<modulus);  // uint64_t guarantees b>=0.
-
-    uint64_t sum = static_cast<uint64_t>(a + b);
-    uint64_t tmp = sum;
-    __asm__ ("subq %[m], %[sum] \n\t"         /* tmp2 = sum - m */
-             "cmovbq %[tmp], %[sum] \n\t"     /* sum = (sum<m) ? tmp : tmp2 */
-             : [sum]"+&r"(sum)
-# if defined(__clang__)        /* https://bugs.llvm.org/show_bug.cgi?id=20197 */
-             : [m]"r"(modulus), [tmp]"r"(tmp)
-# else
-             : [m]"rm"(modulus), [tmp]"rm"(tmp)
-# endif
-             : "cc");
-    uint64_t result = sum;
-
-    HPBC_POSTCONDITION2(result < modulus);  // uint64_t guarantees result>=0.
-    HPBC_POSTCONDITION2(result ==
-                 default_impl_modadd_unsigned<LowuopsTag>::call(a, b, modulus));
-    return result;
-  }
-};
-
-#ifdef HURCHALLA_ENABLE_INLINE_ASM_128_BIT
-template <>
-struct impl_modular_addition_unsigned<__uint128_t, LowuopsTag> {
-  HURCHALLA_FORCE_INLINE static
-  __uint128_t call(__uint128_t a, __uint128_t b, __uint128_t modulus)
-  {
-    using std::uint64_t;
-    HPBC_PRECONDITION2(modulus>0);
-    HPBC_PRECONDITION2(a<modulus);  // __uint128_t guarantees a>=0.
-    HPBC_PRECONDITION2(b<modulus);  // __uint128_t guarantees b>=0.
-
-    __uint128_t sum = static_cast<__uint128_t>(a + b);
-    uint64_t sumlo = static_cast<uint64_t>(sum);
-    uint64_t sumhi = static_cast<uint64_t>(sum >> 64);
-    uint64_t tmplo = sumlo;
-    uint64_t tmphi = sumhi;
-    uint64_t mlo = static_cast<uint64_t>(modulus);
-    uint64_t mhi = static_cast<uint64_t>(modulus >> 64);
-    __asm__ ("subq %[mlo], %[sumlo] \n\t"       /* tmp2 = sum - m */
-             "sbbq %[mhi], %[sumhi] \n\t"
-             "cmovbq %[tmplo], %[sumlo] \n\t"   /* sum = (sum<m) ? tmp : tmp2 */
-             "cmovbq %[tmphi], %[sumhi] \n\t"
-             : [sumlo]"+&r"(sumlo), [sumhi]"+&r"(sumhi)
-# if defined(__clang__)        /* https://bugs.llvm.org/show_bug.cgi?id=20197 */
-             : [mlo]"r"(mlo), [mhi]"r"(mhi), [tmplo]"r"(tmplo), [tmphi]"r"(tmphi)
-# else
-             : [mlo]"rm"(mlo), [mhi]"rm"(mhi), [tmplo]"rm"(tmplo), [tmphi]"rm"(tmphi)
-# endif
-             : "cc");
-    __uint128_t result = (static_cast<__uint128_t>(sumhi) << 64) | sumlo;
-
-    HPBC_POSTCONDITION2(result < modulus);  // __uint128_t guarantees result>=0.
-    HPBC_POSTCONDITION2(result ==
-              default_impl_modadd_unsigned<LowuopsTag>::call(a, b, modulus));
+                             default_impl_modadd_unsigned::call(a, b, modulus));
     return result;
   }
 };
@@ -383,23 +243,21 @@ struct impl_modular_addition_unsigned<__uint128_t, LowuopsTag> {
 #endif
 
 
-// You must use either LowlatencyTag or LowuopsTag for PTAG.  See comment at
-// top of this file for details.
 
 // version for unsigned T
-template <typename T, class PTAG, bool = ut_numeric_limits<T>::is_signed>
+template <typename T, bool = ut_numeric_limits<T>::is_signed>
 struct impl_modular_addition {
   HURCHALLA_FORCE_INLINE static T call(T a, T b, T modulus)
   {
     static_assert(ut_numeric_limits<T>::is_integer, "");
     static_assert(!(ut_numeric_limits<T>::is_signed), "");
-    return impl_modular_addition_unsigned<T,PTAG>::call(a, b, modulus);
+    return impl_modular_addition_unsigned<T>::call(a, b, modulus);
   }
 };
 
 // version for signed T
-template <typename T, class PTAG>
-struct impl_modular_addition<T, PTAG, true> {
+template <typename T>
+struct impl_modular_addition<T, true> {
   HURCHALLA_FORCE_INLINE static T call(T a, T b, T modulus)
   {
     static_assert(ut_numeric_limits<T>::is_integer, "");
@@ -424,10 +282,10 @@ struct impl_modular_addition<T, PTAG, true> {
     U mask = static_cast<U>(tmp >> ut_numeric_limits<T>::digits);
     U masked_modulus = static_cast<U>(mask & static_cast<U>(modulus));
     U result = static_cast<U>(static_cast<U>(tmp) + masked_modulus);
-    HPBC_ASSERT2(result == impl_modular_addition_unsigned<U,PTAG>::call(
+    HPBC_ASSERT2(result == impl_modular_addition_unsigned<U>::call(
                 static_cast<U>(a), static_cast<U>(b), static_cast<U>(modulus)));
 #else
-    U result = impl_modular_addition_unsigned<U,PTAG>::call(static_cast<U>(a),
+    U result = impl_modular_addition_unsigned<U>::call(static_cast<U>(a),
                                     static_cast<U>(b), static_cast<U>(modulus));
 #endif
     return static_cast<T>(result);
 
@@ -68,12 +68,10 @@ struct slow_modular_multiplication {
     T result = 0;
     while (b > 0) {
         namespace hc = ::hurchalla;
-        T tmp = hc::modular_addition_prereduced_inputs<T,LowlatencyTag>(
-                                                            a, result, modulus);
+        T tmp = hc::modular_addition_prereduced_inputs(a, result, modulus);
           // result = (b&1) ? tmp : result
         result = hc::conditional_select((b & 1u), tmp, result);
-        a = hc::modular_addition_prereduced_inputs<T,LowlatencyTag>(
-                                                                 a, a, modulus);
+        a = hc::modular_addition_prereduced_inputs(a, a, modulus);
         b = static_cast<T>(b >> 1);
     }
     return result;