Skip to content

Commit d1d7add

Browse files
committed
use branchless shifts
1 parent 0a1c971 commit d1d7add

File tree

9 files changed

+158
-140
lines changed

9 files changed

+158
-140
lines changed

modular_arithmetic/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ include(FetchContent)
7575
FetchContent_Declare(
7676
hurchalla_util
7777
GIT_REPOSITORY https://github.com/hurchalla/util.git
78-
GIT_TAG b1e9990b4293bfaaa040731fc9ec97edfeeb85cb
78+
GIT_TAG 22181fce5ae399896921a8f0538a88a9dbe41e66
7979
)
8080
FetchContent_MakeAvailable(hurchalla_util)
8181

montgomery_arithmetic/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ include(FetchContent)
7878
FetchContent_Declare(
7979
hurchalla_util
8080
GIT_REPOSITORY https://github.com/hurchalla/util.git
81-
GIT_TAG b1e9990b4293bfaaa040731fc9ec97edfeeb85cb
81+
GIT_TAG 22181fce5ae399896921a8f0538a88a9dbe41e66
8282
)
8383
FetchContent_MakeAvailable(hurchalla_util)
8484

montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/MontyCommonBase.h

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@
2121
#include "hurchalla/util/traits/ut_numeric_limits.h"
2222
#include "hurchalla/util/unsigned_multiply_to_hilo_product.h"
2323
#include "hurchalla/util/compiler_macros.h"
24+
#include "hurchalla/util/branchless_shift_left.h"
25+
#include "hurchalla/util/branchless_shift_right.h"
26+
#include "hurchalla/util/branchless_large_shift_left.h"
27+
#include "hurchalla/util/branchless_small_shift_right.h"
2428
#include "hurchalla/modular_arithmetic/detail/clockwork_programming_by_contract.h"
2529
#include <type_traits>
2630

@@ -486,15 +490,24 @@ class MontyCommonBase {
486490
{
487491
static constexpr int digitsT = ut_numeric_limits<T>::digits;
488492
HPBC_CLOCKWORK_PRECONDITION2(0 <= exponent && exponent < digitsT);
493+
HPBC_CLOCKWORK_PRECONDITION2(exponent < HURCHALLA_TARGET_BIT_WIDTH);
489494
int power = digitsT - exponent;
490495
HPBC_CLOCKWORK_ASSERT2(0 < power && power <= digitsT);
491496

492497
T tmp = cx.get();
493498
HPBC_CLOCKWORK_INVARIANT2(tmp < n_);
494-
T u_lo = static_cast<T>((tmp << 1) << (power - 1));
495-
int rshift = digitsT - power;
499+
500+
HPBC_CLOCKWORK_ASSERT2(0 <= power - 1 && power - 1 < digitsT);
501+
// we know by precondition that exponent < HURCHALLA_TARGET_BIT_WIDTH
502+
// thus, digitsT - HURCHALLA_TARGET_BIT_WIDTH < digitsT - exponent == power
503+
// and so, digitsT - HURCHALLA_TARGET_BIT_WIDTH <= power - 1
504+
HPBC_CLOCKWORK_ASSERT2(digitsT - static_cast<int>(HURCHALLA_TARGET_BIT_WIDTH) <= power - 1);
505+
T u_lo = static_cast<T>(branchless_large_shift_left(static_cast<T>(tmp << 1), power - 1));
506+
507+
int rshift = exponent;
496508
HPBC_CLOCKWORK_ASSERT2(0 <= rshift && rshift < digitsT);
497-
T u_hi = static_cast<T>(tmp >> rshift);
509+
HPBC_CLOCKWORK_ASSERT2(rshift < HURCHALLA_TARGET_BIT_WIDTH);
510+
T u_hi = static_cast<T>(branchless_small_shift_right(tmp, rshift));
498511

499512
HPBC_CLOCKWORK_ASSERT2(u_hi < n_);
500513
const D* child = static_cast<const D*>(this);
@@ -519,10 +532,10 @@ class MontyCommonBase {
519532

520533
T tmp = cx.get();
521534
HPBC_CLOCKWORK_INVARIANT2(tmp < n_);
522-
T u_lo = static_cast<T>(tmp << power);
535+
T u_lo = branchless_shift_left(tmp, power);
523536
int rshift = digitsT - power;
524537
HPBC_CLOCKWORK_ASSERT2(rshift > 0);
525-
T u_hi = (tmp >> 1) >> (rshift - 1);
538+
T u_hi = static_cast<T>(branchless_shift_right(tmp, rshift - 1) >> 1);
526539

527540
HPBC_CLOCKWORK_ASSERT2(u_hi < n_);
528541
const D* child = static_cast<const D*>(this);
@@ -539,10 +552,10 @@ class MontyCommonBase {
539552

540553
T tmp = cx.get();
541554
HPBC_CLOCKWORK_INVARIANT2(tmp < n_);
542-
T u_lo = static_cast<T>((tmp << 1) << (power - 1));
555+
T u_lo = branchless_shift_left(static_cast<T>(tmp << 1), power - 1);
543556
int rshift = digitsT - power;
544557
HPBC_CLOCKWORK_ASSERT2(0 <= rshift && rshift < digitsT);
545-
T u_hi = static_cast<T>(tmp >> rshift);
558+
T u_hi = branchless_shift_right(tmp, rshift);
546559

547560
HPBC_CLOCKWORK_ASSERT2(u_hi < n_);
548561
const D* child = static_cast<const D*>(this);
@@ -593,10 +606,10 @@ class MontyCommonBase {
593606
int power = static_cast<int>(exponent);
594607
HPBC_CLOCKWORK_PRECONDITION2(0 <= power && power < digitsT);
595608

596-
T u_lo = static_cast<T>(r_squared_mod_n_ << power);
609+
T u_lo = branchless_shift_left(r_squared_mod_n_, power);
597610
int rshift = digitsT - power;
598611
HPBC_CLOCKWORK_ASSERT2(rshift > 0);
599-
T u_hi = (r_squared_mod_n_ >> 1) >> (rshift - 1);
612+
T u_hi = branchless_shift_right(static_cast<T>(r_squared_mod_n_ >> 1), rshift - 1);
600613

601614
HPBC_CLOCKWORK_ASSERT2(u_hi < n_);
602615
const D* child = static_cast<const D*>(this);
@@ -613,10 +626,10 @@ class MontyCommonBase {
613626
int power = static_cast<int>(exponent);
614627
HPBC_CLOCKWORK_PRECONDITION2(0 <= power && power < digitsT);
615628

616-
T u_lo = static_cast<T>(magicValue << power);
629+
T u_lo = branchless_shift_left(magicValue, power);
617630
int rshift = digitsT - power;
618631
HPBC_CLOCKWORK_ASSERT2(rshift > 0);
619-
T u_hi = (magicValue >> 1) >> (rshift - 1);
632+
T u_hi = branchless_shift_right(static_cast<T>(magicValue >> 1), rshift - 1);
620633

621634
HPBC_CLOCKWORK_ASSERT2(u_hi < n_);
622635
const D* child = static_cast<const D*>(this);

montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/montgomery_pow_2kary.h

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include "hurchalla/util/traits/ut_numeric_limits.h"
1414
#include "hurchalla/util/count_leading_zeros.h"
1515
#include "hurchalla/util/compiler_macros.h"
16+
#include "hurchalla/util/branchless_shift_right.h"
1617
#include "hurchalla/modular_arithmetic/detail/clockwork_programming_by_contract.h"
1718
#include "hurchalla/util/traits/extensible_make_unsigned.h"
1819
#include <type_traits>
@@ -158,7 +159,7 @@ struct impl_montgomery_pow_2kary {
158159
HPBC_CLOCKWORK_ASSERT(numbits > P);
159160

160161
int shift = numbits - P;
161-
U tmp = n >> shift;
162+
U tmp = branchless_shift_right(n, shift);
162163
HPBC_CLOCKWORK_ASSERT(tmp <= MASK);
163164
// normally we'd use (tmp & MASK), but it's redundant with tmp <= MASK
164165
size_t index = static_cast<size_t>(tmp);
@@ -167,7 +168,7 @@ struct impl_montgomery_pow_2kary {
167168

168169
while (shift >= P) {
169170
if (USE_SLIDING_WINDOW_OPTIMIZATION) {
170-
while (shift > P && (static_cast<size_t>(n >> (shift-1)) & 1) == 0) {
171+
while (shift > P && (static_cast<size_t>(branchless_shift_right(n, shift-1)) & 1) == 0) {
171172
result = mf.square(result);
172173
--shift;
173174
}
@@ -178,7 +179,7 @@ struct impl_montgomery_pow_2kary {
178179
result = mf.square(result);
179180

180181
shift -= P;
181-
index = static_cast<size_t>(n >> shift) & MASK;
182+
index = static_cast<size_t>(branchless_shift_right(n, shift)) & MASK;
182183
result = mf.multiply(result, table[index]);
183184
}
184185

@@ -287,9 +288,9 @@ struct impl_montgomery_pow_2kary {
287288
std::array<V, ARRAY_SIZE> result;
288289
std::array<size_t, ARRAY_SIZE> index;
289290
HURCHALLA_REQUEST_UNROLL_LOOP for (size_t j=0; j<ARRAY_SIZE; ++j) {
290-
HPBC_CLOCKWORK_ASSERT(static_cast<U>(n[j] >> shift) <= MASK);
291-
// We don't need to 'and' with MASK, because (n[j] >> shift) <= MASK.
292-
index[j] = static_cast<size_t>(n[j] >> shift);
291+
HPBC_CLOCKWORK_ASSERT(static_cast<U>(branchless_shift_right(n[j], shift)) <= MASK);
292+
// We don't need to 'and' with MASK, because (branchless_shift_right(n[j], shift)) <= MASK.
293+
index[j] = static_cast<size_t>(branchless_shift_right(n[j], shift));
293294
result[j] = table[index[j]][j];
294295
}
295296

@@ -301,7 +302,7 @@ struct impl_montgomery_pow_2kary {
301302
}
302303
shift -= P;
303304
HURCHALLA_REQUEST_UNROLL_LOOP for (size_t j=0; j<ARRAY_SIZE; ++j) {
304-
index[j] = static_cast<size_t>(n[j] >> shift) & MASK;
305+
index[j] = static_cast<size_t>(branchless_shift_right(n[j], shift)) & MASK;
305306
result[j] = mf[j].template multiply<LowuopsTag>(
306307
result[j], table[index[j]][j]);
307308
}
@@ -402,7 +403,7 @@ struct impl_montgomery_pow_2kary {
402403

403404
int shift = numbits - P;
404405
std::array<V, ARRAY_SIZE> result;
405-
size_t tmp = static_cast<size_t>(n >> shift);
406+
size_t tmp = static_cast<size_t>(branchless_shift_right(n, shift));
406407
HPBC_CLOCKWORK_ASSERT(tmp <= MASK);
407408
HURCHALLA_REQUEST_UNROLL_LOOP for (size_t j=0; j<ARRAY_SIZE; ++j) {
408409
// normally we'd use (tmp & MASK), but it's redundant with tmp <= MASK
@@ -412,7 +413,7 @@ struct impl_montgomery_pow_2kary {
412413

413414
while (shift >= P) {
414415
if (USE_SLIDING_WINDOW_OPTIMIZATION) {
415-
while (shift > P && (static_cast<size_t>(n >> (shift-1)) & 1) == 0) {
416+
while (shift > P && (static_cast<size_t>(branchless_shift_right(n, shift-1)) & 1) == 0) {
416417
HURCHALLA_REQUEST_UNROLL_LOOP for (size_t j=0; j<ARRAY_SIZE; ++j)
417418
result[j] = mf.template square<LowuopsTag>(result[j]);
418419
--shift;
@@ -423,7 +424,7 @@ struct impl_montgomery_pow_2kary {
423424
result[j] = mf.template square<LowuopsTag>(result[j]);
424425
}
425426
shift -= P;
426-
size_t index = static_cast<size_t>(n >> shift) & MASK;
427+
size_t index = static_cast<size_t>(branchless_shift_right(n, shift)) & MASK;
427428
HURCHALLA_REQUEST_UNROLL_LOOP for (size_t j=0; j<ARRAY_SIZE; ++j) {
428429
result[j] = mf.template multiply<LowuopsTag>(result[j], table[index][j]);
429430
}

montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/montgomery_pow_2kary/experimental_montgomery_pow_2kary.h

Lines changed: 24 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "hurchalla/util/traits/ut_numeric_limits.h"
1515
#include "hurchalla/util/count_leading_zeros.h"
1616
#include "hurchalla/util/compiler_macros.h"
17+
#include "hurchalla/util/branchless_shift_right.h"
1718
#include "hurchalla/modular_arithmetic/detail/clockwork_programming_by_contract.h"
1819
#include "hurchalla/util/traits/extensible_make_unsigned.h"
1920
#include <type_traits>
@@ -208,15 +209,15 @@ if HURCHALLA_CPP17_CONSTEXPR (CODE_SECTION == 0) {
208209
HPBC_CLOCKWORK_ASSERT(numbits > P);
209210

210211
int shift = numbits - P;
211-
U tmp = n >> shift;
212+
U tmp = branchless_shift_right(n, shift);
212213
HPBC_CLOCKWORK_ASSERT(tmp <= MASK);
213214
// normally we'd use (tmp & MASK), but it's redundant with tmp <= MASK
214215
size_t index = static_cast<size_t>(tmp);
215216
result = table[index];
216217

217218
while (shift >= P) {
218219
if HURCHALLA_CPP17_CONSTEXPR (USE_SLIDING_WINDOW_OPTIMIZATION) {
219-
while (shift > P && (static_cast<size_t>(n >> (shift-1)) & 1u) == 0) {
220+
while (shift > P && (static_cast<size_t>(branchless_shift_right(n, shift-1)) & 1u) == 0) {
220221
result = mf.square(result);
221222
--shift;
222223
}
@@ -236,7 +237,7 @@ if HURCHALLA_CPP17_CONSTEXPR (CODE_SECTION == 0) {
236237
}
237238

238239
shift -= P;
239-
index = static_cast<size_t>(n >> shift) & MASK;
240+
index = static_cast<size_t>(branchless_shift_right(n, shift)) & MASK;
240241
result = mf.multiply(result, table[index]);
241242
}
242243

@@ -314,7 +315,7 @@ if HURCHALLA_CPP17_CONSTEXPR (CODE_SECTION == 0) {
314315
int shift = numbits - NUMBITS_MASKBIG;
315316

316317
HPBC_CLOCKWORK_ASSERT2(shift > 0);
317-
size_t tmp = static_cast<size_t>(n >> shift);
318+
size_t tmp = static_cast<size_t>(branchless_shift_right(n, shift));
318319
HPBC_CLOCKWORK_ASSERT2(tmp <= MASKBIG);
319320
size_t loindex = tmp & MASK;
320321
size_t hiindex = tmp >> TABLE_BITS;
@@ -323,15 +324,15 @@ if HURCHALLA_CPP17_CONSTEXPR (CODE_SECTION == 0) {
323324

324325
while (shift >= NUMBITS_MASKBIG) {
325326
if HURCHALLA_CPP17_CONSTEXPR (USE_SLIDING_WINDOW_OPTIMIZATION) {
326-
while (shift > NUMBITS_MASKBIG && (static_cast<size_t>(n>>(shift-1)) & 1u) == 0) {
327+
while (shift > NUMBITS_MASKBIG && (static_cast<size_t>(branchless_shift_right(n, shift-1)) & 1u) == 0) {
327328
result = mf.square(result);
328329
--shift;
329330
}
330331
}
331332
HPBC_CLOCKWORK_ASSERT2(shift >= NUMBITS_MASKBIG);
332333

333334
shift -= NUMBITS_MASKBIG;
334-
tmp = static_cast<size_t>(n >> shift);
335+
tmp = static_cast<size_t>(branchless_shift_right(n, shift));
335336
loindex = tmp & MASK;
336337
hiindex = (tmp >> TABLE_BITS) & MASK;
337338

@@ -388,7 +389,7 @@ if HURCHALLA_CPP17_CONSTEXPR (CODE_SECTION == 0) {
388389
shift = numbits - NUMBITS_MASKBIG;
389390
}
390391
HPBC_CLOCKWORK_ASSERT2(shift >= 0);
391-
size_t tmp = static_cast<size_t>(n >> shift);
392+
size_t tmp = static_cast<size_t>(branchless_shift_right(n, shift));
392393
HPBC_CLOCKWORK_ASSERT2(tmp <= MASKBIG);
393394

394395
size_t index1 = tmp & MASK;
@@ -457,15 +458,15 @@ if HURCHALLA_CPP17_CONSTEXPR (CODE_SECTION == 0) {
457458

458459
while (shift >= NUMBITS_MASKBIG) {
459460
if HURCHALLA_CPP17_CONSTEXPR (USE_SLIDING_WINDOW_OPTIMIZATION) {
460-
while (shift > NUMBITS_MASKBIG && (static_cast<size_t>(n>>(shift-1)) & 1u) == 0) {
461+
while (shift > NUMBITS_MASKBIG && (static_cast<size_t>(branchless_shift_right(n, shift-1)) & 1u) == 0) {
461462
result = mf.square(result);
462463
--shift;
463464
}
464465
}
465466
HPBC_CLOCKWORK_ASSERT2(shift >= NUMBITS_MASKBIG);
466467

467468
shift -= NUMBITS_MASKBIG;
468-
tmp = static_cast<size_t>(n >> shift);
469+
tmp = static_cast<size_t>(branchless_shift_right(n, shift));
469470

470471
index1 = tmp & MASK;
471472
index2 = (tmp >> TABLE_BITS) & MASK;
@@ -564,7 +565,7 @@ if HURCHALLA_CPP17_CONSTEXPR (CODE_SECTION == 0) {
564565
}
565566
HPBC_CLOCKWORK_ASSERT2(shift >= 0);
566567

567-
size_t tmp = static_cast<size_t>(n >> shift);
568+
size_t tmp = static_cast<size_t>(branchless_shift_right(n, shift));
568569
HPBC_CLOCKWORK_ASSERT2(tmp <= MASKBIG);
569570
V result = table[0][tmp & MASK];
570571

@@ -596,15 +597,15 @@ if HURCHALLA_CPP17_CONSTEXPR (CODE_SECTION == 0) {
596597

597598
while (shift >= NUMBITS_MASKBIG) {
598599
if HURCHALLA_CPP17_CONSTEXPR (USE_SLIDING_WINDOW_OPTIMIZATION) {
599-
while (shift > NUMBITS_MASKBIG && (static_cast<size_t>(n>>(shift-1)) & 1u) == 0) {
600+
while (shift > NUMBITS_MASKBIG && (static_cast<size_t>(branchless_shift_right(n, shift-1)) & 1u) == 0) {
600601
result = mf.square(result);
601602
--shift;
602603
}
603604
}
604605
HPBC_CLOCKWORK_ASSERT2(shift >= NUMBITS_MASKBIG);
605606

606607
shift -= NUMBITS_MASKBIG;
607-
tmp = static_cast<size_t>(n >> shift);
608+
tmp = static_cast<size_t>(branchless_shift_right(n, shift));
608609
V val1 = table[0][tmp & MASK];
609610

610611
if HURCHALLA_CPP17_CONSTEXPR (USE_SQUARING_VALUE_OPTIMIZATION) {
@@ -755,9 +756,9 @@ if HURCHALLA_CPP17_CONSTEXPR (CODE_SECTION == 0) {
755756

756757
int shift = numbits - P;
757758
HURCHALLA_REQUEST_UNROLL_LOOP for (size_t j=0; j<ARRAY_SIZE; ++j) {
758-
HPBC_CLOCKWORK_ASSERT(static_cast<U>(n[j] >> shift) <= MASK);
759-
// We don't need to 'and' with MASK, because (n[j] >> shift) <= MASK.
760-
size_t index = static_cast<size_t>(n[j] >> shift);
759+
HPBC_CLOCKWORK_ASSERT(static_cast<U>(branchless_shift_right(n[j], shift)) <= MASK);
760+
// We don't need to 'and' with MASK, because (branchless_shift_right(n[j], shift)) <= MASK.
761+
size_t index = static_cast<size_t>(branchless_shift_right(n[j], shift));
761762
result[j] = table[index][j];
762763
}
763764

@@ -784,7 +785,7 @@ if HURCHALLA_CPP17_CONSTEXPR (CODE_SECTION == 0) {
784785
}
785786

786787
HURCHALLA_REQUEST_UNROLL_LOOP for (size_t j=0; j<ARRAY_SIZE; ++j) {
787-
size_t index = static_cast<size_t>(n[j] >> shift) & MASK;
788+
size_t index = static_cast<size_t>(branchless_shift_right(n[j], shift)) & MASK;
788789
result[j] = mf[j].template multiply<LowuopsTag>(
789790
result[j], table[index][j]);
790791
}
@@ -888,7 +889,7 @@ if HURCHALLA_CPP17_CONSTEXPR (CODE_SECTION == 0) {
888889
HPBC_CLOCKWORK_ASSERT(numbits > P);
889890

890891
int shift = numbits - P;
891-
size_t index = static_cast<size_t>(n >> shift);
892+
size_t index = static_cast<size_t>(branchless_shift_right(n, shift));
892893
HPBC_CLOCKWORK_ASSERT(index <= MASK);
893894
HURCHALLA_REQUEST_UNROLL_LOOP for (size_t j=0; j<ARRAY_SIZE; ++j) {
894895
// normally we'd use (index & MASK), but it's redundant with index <= MASK
@@ -898,7 +899,7 @@ if HURCHALLA_CPP17_CONSTEXPR (CODE_SECTION == 0) {
898899

899900
while (shift >= P) {
900901
if (USE_SLIDING_WINDOW_OPTIMIZATION) {
901-
while (shift > P && (static_cast<size_t>(n >> (shift-1)) & 1u) == 0) {
902+
while (shift > P && (static_cast<size_t>(branchless_shift_right(n, shift-1)) & 1u) == 0) {
902903
HURCHALLA_REQUEST_UNROLL_LOOP for (size_t j=0; j<ARRAY_SIZE; ++j)
903904
result[j] = mf.template square<LowuopsTag>(result[j]);
904905
--shift;
@@ -924,7 +925,7 @@ if HURCHALLA_CPP17_CONSTEXPR (CODE_SECTION == 0) {
924925
}
925926

926927
shift -= P;
927-
index = static_cast<size_t>(n >> shift) & MASK;
928+
index = static_cast<size_t>(branchless_shift_right(n, shift)) & MASK;
928929
HURCHALLA_REQUEST_UNROLL_LOOP for (size_t j=0; j<ARRAY_SIZE; ++j) {
929930
result[j] = mf.template multiply<LowuopsTag>(result[j], table[index][j]);
930931
}
@@ -1009,9 +1010,9 @@ if HURCHALLA_CPP17_CONSTEXPR (CODE_SECTION == 0) {
10091010

10101011
int shift = numbits - P;
10111012
HPBC_CLOCKWORK_ASSERT(shift >= 0);
1012-
HPBC_CLOCKWORK_ASSERT((n >> shift) <= MASK);
1013+
HPBC_CLOCKWORK_ASSERT((branchless_shift_right(n, shift)) <= MASK);
10131014
// due to above assert, we don't need to 'and' with MASK
1014-
size_t index = static_cast<size_t>(n >> shift);
1015+
size_t index = static_cast<size_t>(branchless_shift_right(n, shift));
10151016

10161017
// because the highest set bit of n is by definition a 1, we know
10171018
HPBC_CLOCKWORK_ASSERT((index >> (P-1)) == 1u); // and thus
@@ -1027,7 +1028,7 @@ if HURCHALLA_CPP17_CONSTEXPR (CODE_SECTION == 0) {
10271028

10281029
while (shift >= P) {
10291030
if (USE_SLIDING_WINDOW_OPTIMIZATION) {
1030-
while (shift > P && (static_cast<size_t>(n >> (shift-1)) & 1u) == 0) {
1031+
while (shift > P && (static_cast<size_t>(branchless_shift_right(n, shift-1)) & 1u) == 0) {
10311032
HURCHALLA_REQUEST_UNROLL_LOOP for (size_t j=0; j<ARRAY_SIZE; ++j)
10321033
result[j] = mf.template square<LowuopsTag>(result[j]);
10331034
--shift;
@@ -1055,7 +1056,7 @@ if HURCHALLA_CPP17_CONSTEXPR (CODE_SECTION == 0) {
10551056
}
10561057

10571058
shift -= P;
1058-
index = static_cast<size_t>(n >> shift) & MASK;
1059+
index = static_cast<size_t>(branchless_shift_right(n, shift)) & MASK;
10591060

10601061
HURCHALLA_REQUEST_UNROLL_LOOP for (size_t j=0; j<ARRAY_SIZE; ++j) {
10611062
V tmp = (index % 2 == 0) ? table[index/2][j] : result[j];

montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/montgomery_pow_2kary/testbench_montgomery_pow_2kary.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
* License, v. 2.0. If a copy of the MPL was not distributed with this
55
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
66
*/
7+
78
#include "hurchalla/montgomery_arithmetic/low_level_api/detail/platform_specific/impl_array_get_Rsquared_mod_n.h"
89
#include "hurchalla/montgomery_arithmetic/MontgomeryForm.h"
910
#include "hurchalla/montgomery_arithmetic/montgomery_form_aliases.h"

0 commit comments

Comments
 (0)