hurchalla
diff --git a/‎modular_arithmetic/CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion b/‎modular_arithmetic/CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎montgomery_arithmetic/CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion b/‎montgomery_arithmetic/CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/montgomery_two_pow/experimental_montgomery_two_pow.h‎
Lines changed: 165 additions & 0 deletions b/‎montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/montgomery_two_pow/experimental_montgomery_two_pow.h‎
Lines changed: 165 additions & 0 deletions
diff --git a/‎montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/montgomery_two_pow/testbench_montgomery_two_pow.cpp‎
Lines changed: 16 additions & 23 deletions b/‎montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/montgomery_two_pow/testbench_montgomery_two_pow.cpp‎
Lines changed: 16 additions & 23 deletions
@@ -75,7 +75,7 @@ include(FetchContent)
 FetchContent_Declare(
     hurchalla_util
     GIT_REPOSITORY https://github.com/hurchalla/util.git
-    GIT_TAG        9fac434b586717052c648339eb0f0f89d23e0298
+    GIT_TAG        ea4d95c8852d8351cbd1529bbb48a9c10e7d61bf
 )
 FetchContent_MakeAvailable(hurchalla_util)
 
 
@@ -79,7 +79,7 @@ include(FetchContent)
 FetchContent_Declare(
     hurchalla_util
     GIT_REPOSITORY https://github.com/hurchalla/util.git
-    GIT_TAG        9fac434b586717052c648339eb0f0f89d23e0298
+    GIT_TAG        ea4d95c8852d8351cbd1529bbb48a9c10e7d61bf
 )
 FetchContent_MakeAvailable(hurchalla_util)
 
 
@@ -2726,6 +2726,171 @@ goto break_0_39;
         }
         result = mf.multiply(result, val1);
         return result;
+} else if HURCHALLA_CPP17_CONSTEXPR (CODE_SECTION == 40) {
+        // Variant of code section 28 that tracks 'bits_remaining' (the number of
+        // exponent bits not yet consumed) instead of 'shift', aiming for more
+        // efficient shifts.  Whether it is actually faster is unconfirmed.
+        // Exponents no larger than MASK are handled by one direct call.
+        if (n <= MASK) {
+            C cR1 = MFE::getMontvalueR(mf);
+            V result = MFE::twoPowLimited_times_x(mf, static_cast<size_t>(n), cR1);
+            return result;
+        }
+        HPBC_CLOCKWORK_ASSERT2(n > MASK);
+
+        HPBC_CLOCKWORK_ASSERT2(n > 0);
+        int leading_zeros = count_leading_zeros(n);
+        int bits_remaining = ut_numeric_limits<decltype(n)>::digits - leading_zeros;
+        HPBC_CLOCKWORK_ASSERT2(bits_remaining > P2);
+        // n2: n shifted left by leading_zeros, so upcoming bits are read from the top.
+        U n2 = branchless_shift_left(n, leading_zeros);
+
+        // calculate the constexpr var 'high_word_shift' - right shifting a type
+        // U variable by this amount leaves its most significant size_t-width
+        // bits (the amount is 0 when U is no wider than size_t).  We assume the
+        // right shift by high_word_shift is zero cost, since the shift is just
+        // a way to access the CPU register that has the most significant bits
+        // - unless the compiler is really dumb and misses this optimization,
+        // which I haven't seen happen and which would surprise me.
+        constexpr int size_t_digits = ut_numeric_limits<size_t>::digits;
+        constexpr int digits_U = ut_numeric_limits<U>::digits;
+        constexpr int digits_bigger = (digits_U > size_t_digits) ? digits_U : size_t_digits;
+        constexpr int digits_smaller = (digits_U < size_t_digits) ? digits_U : size_t_digits;
+        constexpr int high_word_shift = digits_bigger - size_t_digits;
+        // the top P2 bits of n2 form the first index; then shift them out of n2
+        size_t index = static_cast<size_t>(n2 >> high_word_shift) >> (digits_smaller - P2);
+        n2 = static_cast<U>(n2 << P2);
+        HPBC_CLOCKWORK_ASSERT2(index <= MASK);
+        // normally we use (index & MASK), but it's redundant with index <= MASK
+        C cR1 = MFE::getMontvalueR(mf);
+        V result = MFE::twoPowLimited_times_x_v2(mf, index + 1, cR1);
+
+        bits_remaining -= P2;
+        // each pass squares result for P2 bits, then feeds the next P2 bits of n2 to _v2
+        while (bits_remaining >= P2) {
+            if HURCHALLA_CPP17_CONSTEXPR (USE_SQUARING_VALUE_OPTIMIZATION) {
+                SV sv = MFE::getSquaringValue(mf, result);
+                static_assert(P2 > 0, "");
+                HURCHALLA_REQUEST_UNROLL_LOOP for (int i=0; i<P2 - 1; ++i)
+                    sv = MFE::squareSV(mf, sv);
+                result = MFE::squareToMontgomeryValue(mf, sv);
+            } else {
+                HURCHALLA_REQUEST_UNROLL_LOOP for (int i=0; i<P2; ++i)
+                    result = mf.square(result);
+            }
+
+            bits_remaining -= P2;
+            index = static_cast<size_t>(n2 >> high_word_shift) >> (digits_smaller - P2);
+            n2 = static_cast<U>(n2 << P2);
+            C tmp = mf.getCanonicalValue(result);
+            result = MFE::twoPowLimited_times_x_v2(mf, index + 1, tmp);
+        }
+        result = mf.halve(result);  // NOTE(review): section 41 has no matching halve -- confirm
+
+        if (bits_remaining == 0)
+            return result;
+        HPBC_CLOCKWORK_ASSERT2(0 < bits_remaining && bits_remaining < P2);
+        // fewer than P2 bits remain: read just those bits from the top of n2
+        index = static_cast<size_t>(n2 >> high_word_shift) >> (digits_smaller - bits_remaining);
+        V tableVal = MFE::twoPowLimited_times_x(mf, index, cR1);
+        // square for each leftover bit, then multiply by the tableVal factor
+        if HURCHALLA_CPP17_CONSTEXPR (USE_SQUARING_VALUE_OPTIMIZATION) {
+            SV sv = MFE::getSquaringValue(mf, result);
+            HPBC_CLOCKWORK_ASSERT2(bits_remaining >= 1);
+            for (int i=0; i<bits_remaining-1; ++i)
+                sv = MFE::squareSV(mf, sv);
+            result = MFE::squareToMontgomeryValue(mf, sv);
+        }
+        else {
+            for (int i=0; i<bits_remaining; ++i)
+                result = mf.square(result);
+        }
+        result = mf.multiply(result, tableVal);
+        return result;
+} else if HURCHALLA_CPP17_CONSTEXPR (CODE_SECTION == 41) {
+        // Variant of code section 29 that tracks 'bits_remaining' (the number of
+        // exponent bits not yet consumed) instead of 'shift', aiming for more
+        // efficient shifts.  Whether it is actually faster is unconfirmed.
+        // Exponents no larger than MASK are handled by one direct call.
+        if (n <= MASK) {
+            C cR1 = MFE::getMontvalueR(mf);
+            V result = MFE::twoPowLimited_times_x(mf, static_cast<size_t>(n), cR1);
+            return result;
+        }
+        HPBC_CLOCKWORK_ASSERT2(n > MASK);
+
+        HPBC_CLOCKWORK_ASSERT2(n > 0);
+        int leading_zeros = count_leading_zeros(n);
+        int bits_remaining = ut_numeric_limits<decltype(n)>::digits - leading_zeros;
+        HPBC_CLOCKWORK_ASSERT2(bits_remaining > P2);
+        // n2: n shifted left by leading_zeros, so upcoming bits are read from the top.
+        U n2 = branchless_shift_left(n, leading_zeros);
+
+        // calculate the constexpr var 'high_word_shift' - right shifting a type
+        // U variable by this amount leaves its most significant size_t-width
+        // bits (the amount is 0 when U is no wider than size_t).  We assume the
+        // right shift by high_word_shift is zero cost, since the shift is just
+        // a way to access the CPU register that has the most significant bits
+        // - unless the compiler is really dumb and misses this optimization,
+        // which I haven't seen happen and which would surprise me.
+        constexpr int size_t_digits = ut_numeric_limits<size_t>::digits;
+        constexpr int digits_U = ut_numeric_limits<U>::digits;
+        constexpr int digits_bigger = (digits_U > size_t_digits) ? digits_U : size_t_digits;
+        constexpr int digits_smaller = (digits_U < size_t_digits) ? digits_U : size_t_digits;
+        constexpr int high_word_shift = digits_bigger - size_t_digits;
+        // cresult carries the running value in canonical form between passes
+        C cresult = MFE::getMontvalueR(mf);
+
+        HPBC_CLOCKWORK_ASSERT2(bits_remaining > P2);
+        // we check against P2 + P2 because we always process P2 more bits after
+        // the loop ends -- so we need to ensure we'll actually have
+        // (bits_remaining >= P2) after the loop ends.
+        while (bits_remaining >= P2 + P2) {
+            size_t index = static_cast<size_t>(n2 >> high_word_shift) >> (digits_smaller - P2);
+            n2 = static_cast<U>(n2 << P2);
+            V result = MFE::twoPowLimited_times_x_v2(mf, index + 1, cresult);
+
+            if HURCHALLA_CPP17_CONSTEXPR (USE_SQUARING_VALUE_OPTIMIZATION) {
+                SV sv = MFE::getSquaringValue(mf, result);
+                static_assert(P2 > 0, "");
+                HURCHALLA_REQUEST_UNROLL_LOOP for (int i=0; i<P2 - 1; ++i)
+                    sv = MFE::squareSV(mf, sv);
+                result = MFE::squareToMontgomeryValue(mf, sv);
+            } else {
+                HURCHALLA_REQUEST_UNROLL_LOOP for (int i=0; i<P2; ++i)
+                    result = mf.square(result);
+            }
+            cresult = mf.getCanonicalValue(result);
+
+            bits_remaining -= P2;
+        }
+        HPBC_CLOCKWORK_ASSERT2(P2 <= bits_remaining && bits_remaining < P2 + P2);
+        // consume the P2 bits that the loop condition reserved for this step
+        size_t index = static_cast<size_t>(n2 >> high_word_shift) >> (digits_smaller - P2);
+        n2 = static_cast<U>(n2 << P2);
+        V result = MFE::twoPowLimited_times_x(mf, index, cresult);
+        bits_remaining -= P2;
+        if (bits_remaining == 0)
+            return result;
+        HPBC_CLOCKWORK_ASSERT2(0 < bits_remaining && bits_remaining < P2);
+        // fewer than P2 bits remain: read just those bits from the top of n2
+        index = static_cast<size_t>(n2 >> high_word_shift) >> (digits_smaller - bits_remaining);
+        C cR1 = MFE::getMontvalueR(mf);
+        V tableVal = MFE::twoPowLimited_times_x(mf, index, cR1);
+        // square for each leftover bit, then multiply by the tableVal factor
+        if HURCHALLA_CPP17_CONSTEXPR (USE_SQUARING_VALUE_OPTIMIZATION) {
+            SV sv = MFE::getSquaringValue(mf, result);
+            HPBC_CLOCKWORK_ASSERT2(bits_remaining >= 1);
+            for (int i=0; i<bits_remaining-1; ++i)
+                sv = MFE::squareSV(mf, sv);
+            result = MFE::squareToMontgomeryValue(mf, sv);
+        }
+        else {
+            for (int i=0; i<bits_remaining; ++i)
+                result = mf.square(result);
+        }
+        result = mf.multiply(result, tableVal);
+        return result;
 }
     }
     else if HURCHALLA_CPP17_CONSTEXPR (TABLESIZE == 2) {
 
@@ -945,7 +945,7 @@ using namespace hurchalla;
    std::cout << "\nbegin benchmarks - array two_pow\n";
 
    // warm up call
-   bench_array_two_pow<1, 8, 8, MontType, false>(static_cast<U>(maxU - range), range, dummy, max_modulus_bits_reduce, seed, exponent_bits_reduce);
+   bench_array_two_pow<0, 30, 6, MontType, false>(static_cast<U>(maxU - range), range, dummy, max_modulus_bits_reduce, seed, exponent_bits_reduce);
 
       // format is bench_array_two_pow<TABLE_BITS, CODE_SECTION, ARRAY_SIZE, MontType, USE_SQUARING_VALUE_OPTIMIZATION>(...)
 
@@ -954,21 +954,7 @@ using namespace hurchalla;
    for (size_t i=0; i<4; ++i) {
      for (size_t j=0; j<timingA[i].size(); ++j) {
 
-      timingA[i][j].push_back(
-         bench_array_two_pow<1, 8, 3, MontType, false>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
-      timingA[i][j].push_back(
-         bench_array_two_pow<1, 8, 4, MontType, false>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
-      timingA[i][j].push_back(
-         bench_array_two_pow<1, 8, 5, MontType, false>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
-      timingA[i][j].push_back(
-         bench_array_two_pow<1, 8, 6, MontType, false>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
-      timingA[i][j].push_back(
-         bench_array_two_pow<1, 8, 7, MontType, false>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
-      timingA[i][j].push_back(
-         bench_array_two_pow<1, 8, 8, MontType, false>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
-
-
-#if 0
+#if 1
       timingA[i][j].push_back(
          bench_array_two_pow<0, 27, 3, MontType, false>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
       timingA[i][j].push_back(
@@ -1149,7 +1135,7 @@ using namespace hurchalla;
    }
 #endif
 
-#if 0
+#if 1
       timingA[i][j].push_back(
          bench_array_two_pow<0, 0, 10, MontType, false>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
       timingA[i][j].push_back(
@@ -1784,7 +1770,7 @@ std::cout << "Timings By Test Type:\n";
 
    //  warm up to get cpu boost (or throttle) going
    for (size_t i=0; i<1; ++i)
-      bench_range<1, false, 0, MontType, false>(static_cast<U>(maxU - range), range, dummy, max_modulus_bits_reduce, seed, exponent_bits_reduce);
+      bench_range<0, false, 34, MontType, true>(static_cast<U>(maxU - range), range, dummy, max_modulus_bits_reduce, seed, exponent_bits_reduce);
 
 //   std::array<std::vector<Timing>, 4> timings;
 
@@ -1795,8 +1781,6 @@ std::cout << "Timings By Test Type:\n";
 
        // format is bench_range<TABLE_BITS, USE_SLIDING_WINDOW_OPTIMIZATION, CODE_SECTION,
        //                       MontType, USE_SQUARING_VALUE_OPTIMIZATION>
-      timings[i][j].push_back(
-         bench_range<1, false, 0, MontType, false>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
 
 #if 0
 // This is a copy/paste of the "best of best" code sections from further below (nothing is new here).
@@ -2016,8 +2000,7 @@ std::cout << "Timings By Test Type:\n";
 
 
 
-
-#if 0
+#if 1
       timings[i][j].push_back(
          bench_range<0, true , 17, MontType, false>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
       timings[i][j].push_back(
@@ -2092,6 +2075,11 @@ std::cout << "Timings By Test Type:\n";
       timings[i][j].push_back(
          bench_range<0, false, 39, MontType, false>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
 
+      timings[i][j].push_back(
+         bench_range<0, false, 40, MontType, false>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
+      timings[i][j].push_back(
+         bench_range<0, false, 41, MontType, false>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
+
       timings[i][j].push_back(
          bench_range<0, true , 19, MontType, false>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
       timings[i][j].push_back(
@@ -2242,6 +2230,11 @@ std::cout << "Timings By Test Type:\n";
       timings[i][j].push_back(
          bench_range<0, false, 39, MontType, true>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
 
+      timings[i][j].push_back(
+         bench_range<0, false, 40, MontType, true>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
+      timings[i][j].push_back(
+         bench_range<0, false, 41, MontType, true>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
+
       timings[i][j].push_back(
          bench_range<0, true , 19, MontType, true>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
       timings[i][j].push_back(
@@ -2327,7 +2320,7 @@ std::cout << "Timings By Test Type:\n";
          bench_range<4, true , 1, MontType, false>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
 #endif
 
-#if 0
+#if 1
       timings[i][j].push_back(
          bench_range<4, true , 0, MontType, false>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
       timings[i][j].push_back(
Original file line number	Diff line number	Diff line change
`@@ -75,7 +75,7 @@ include(FetchContent)`
`75`	`75`	`FetchContent_Declare(`
`76`	`76`	`hurchalla_util`
`77`	`77`	`GIT_REPOSITORY https://github.com/hurchalla/util.git`
`78`		`- GIT_TAG 9fac434b586717052c648339eb0f0f89d23e0298`
	`78`	`+ GIT_TAG ea4d95c8852d8351cbd1529bbb48a9c10e7d61bf`
`79`	`79`	`)`
`80`	`80`	`FetchContent_MakeAvailable(hurchalla_util)`
`81`	`81`
Original file line number	Diff line number	Diff line change
`@@ -79,7 +79,7 @@ include(FetchContent)`
`79`	`79`	`FetchContent_Declare(`
`80`	`80`	`hurchalla_util`
`81`	`81`	`GIT_REPOSITORY https://github.com/hurchalla/util.git`
`82`		`- GIT_TAG 9fac434b586717052c648339eb0f0f89d23e0298`
	`82`	`+ GIT_TAG ea4d95c8852d8351cbd1529bbb48a9c10e7d61bf`
`83`	`83`	`)`
`84`	`84`	`FetchContent_MakeAvailable(hurchalla_util)`
`85`	`85`