Skip to content

Commit 7161f04

Browse files
committed
improve benches for mont 2^k-ary scalar pow
1 parent 3ce2a86 commit 7161f04

File tree

2 files changed

+52
-57
lines changed

2 files changed

+52
-57
lines changed

montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/montgomery_pow_2kary/experimental_montgomery_pow_2kary.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -790,7 +790,7 @@ if HURCHALLA_CPP17_CONSTEXPR (CODE_SECTION == 0) {
790790
SV sv = MFE::getSquaringValue(mf, result);
791791
int i=0;
792792
for (size_t k=1; i + static_cast<int>(TABLE_BITS) < shift;
793-
i += TABLE_BITS, ++k) {
793+
i += static_cast<int>(TABLE_BITS), ++k) {
794794
HURCHALLA_REQUEST_UNROLL_LOOP for (size_t h=0; h<TABLE_BITS; ++h)
795795
sv = MFE::squareSV(mf, sv);
796796
size_t index = (tmp >> (k * TABLE_BITS)) & MASK;
@@ -806,7 +806,7 @@ if HURCHALLA_CPP17_CONSTEXPR (CODE_SECTION == 0) {
806806
else {
807807
int i=0;
808808
for (size_t k=1; i + static_cast<int>(TABLE_BITS) < shift;
809-
i += TABLE_BITS, ++k) {
809+
i += static_cast<int>(TABLE_BITS), ++k) {
810810
HURCHALLA_REQUEST_UNROLL_LOOP for (size_t h=0; h<TABLE_BITS; ++h)
811811
result = mf.square(result);
812812
size_t index = (tmp >> (k * TABLE_BITS)) & MASK;
@@ -1024,7 +1024,7 @@ if HURCHALLA_CPP17_CONSTEXPR (CODE_SECTION == 0) {
10241024
SV sv = MFE::getSquaringValue(mf, result);
10251025
int i=0;
10261026
for (size_t k=1; i + static_cast<int>(TABLE_BITS) < bits_remaining;
1027-
i += TABLE_BITS, ++k) {
1027+
i += static_cast<int>(TABLE_BITS), ++k) {
10281028
HURCHALLA_REQUEST_UNROLL_LOOP for (size_t h=0; h<TABLE_BITS; ++h)
10291029
sv = MFE::squareSV(mf, sv);
10301030
size_t index = (tmp >> (k * TABLE_BITS)) & MASK;
@@ -1040,7 +1040,7 @@ if HURCHALLA_CPP17_CONSTEXPR (CODE_SECTION == 0) {
10401040
else {
10411041
int i=0;
10421042
for (size_t k=1; i + static_cast<int>(TABLE_BITS) < bits_remaining;
1043-
i += TABLE_BITS, ++k) {
1043+
i += static_cast<int>(TABLE_BITS), ++k) {
10441044
HURCHALLA_REQUEST_UNROLL_LOOP for (size_t h=0; h<TABLE_BITS; ++h)
10451045
result = mf.square(result);
10461046
size_t index = (tmp >> (k * TABLE_BITS)) & MASK;

montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/montgomery_pow_2kary/testbench_montgomery_pow_2kary.cpp

Lines changed: 48 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -1174,7 +1174,7 @@ bench_range(U min, U range, U& totalU, unsigned int max_modulus_bits_reduce, ST
11741174
#else
11751175
auto val = hurchalla::experimental::experimental_montgomery_pow_2kary::call<
11761176
MontType, U, USE_SLIDING_WINDOW_OPTIMIZATION, TABLE_BITS, CODE_SECTION,
1177-
USE_SQUARING_VALUE_OPTIMIZATION>(mf, mont_base, static_cast<U>(exponent));
1177+
USE_SQUARING_VALUE_OPTIMIZATION>(mf, mont_base, exponent);
11781178
#endif
11791179

11801180
#if 0
@@ -1320,8 +1320,8 @@ void bench_PA_2(std::vector<TimingPA>& vecTimingPA,
13201320

13211321

13221322

1323-
template <size_t ARRAY_SIZE, class MontType, typename U, typename ST>
1324-
void bench_PA_no_sv(std::vector<TimingPA>& vecTimingPA,
1323+
template <class PTAG, size_t ARRAY_SIZE, class MontType, typename U, typename ST>
1324+
void bench_PA_PTAG(std::vector<TimingPA>& vecTimingPA,
13251325
U maxU, U range, U& dummy, unsigned int mmbr, ST seed, unsigned int ebr)
13261326
{
13271327
namespace hc = ::hurchalla;
@@ -1331,56 +1331,49 @@ void bench_PA_no_sv(std::vector<TimingPA>& vecTimingPA,
13311331
// typename U, typename ST> TimingPA bench_partial_array_pow(...)
13321332

13331333
vecTimingPA.push_back(
1334-
bench_partial_array_pow<hc::LowuopsTag, 2, 0, ARRAY_SIZE, MontType, false, false>(static_cast<U>(maxU - range), range, dummy, mmbr, seed, ebr));
1334+
bench_partial_array_pow<PTAG, 2, 0, ARRAY_SIZE, MontType, false, false>(static_cast<U>(maxU - range), range, dummy, mmbr, seed, ebr));
13351335
vecTimingPA.push_back(
1336-
bench_partial_array_pow<hc::LowuopsTag, 2, 3, ARRAY_SIZE, MontType, false, false>(static_cast<U>(maxU - range), range, dummy, mmbr, seed, ebr));
1336+
bench_partial_array_pow<PTAG, 2, 3, ARRAY_SIZE, MontType, false, false>(static_cast<U>(maxU - range), range, dummy, mmbr, seed, ebr));
13371337
vecTimingPA.push_back(
1338-
bench_partial_array_pow<hc::LowuopsTag, 2, 4, ARRAY_SIZE, MontType, false, false>(static_cast<U>(maxU - range), range, dummy, mmbr, seed, ebr));
1338+
bench_partial_array_pow<PTAG, 2, 4, ARRAY_SIZE, MontType, false, false>(static_cast<U>(maxU - range), range, dummy, mmbr, seed, ebr));
13391339
vecTimingPA.push_back(
1340-
bench_partial_array_pow<hc::LowuopsTag, 2, 5, ARRAY_SIZE, MontType, false, false>(static_cast<U>(maxU - range), range, dummy, mmbr, seed, ebr));
1340+
bench_partial_array_pow<PTAG, 2, 5, ARRAY_SIZE, MontType, false, false>(static_cast<U>(maxU - range), range, dummy, mmbr, seed, ebr));
13411341

1342-
vecTimingPA.push_back(
1343-
bench_partial_array_pow<hc::LowlatencyTag, 2, 0, ARRAY_SIZE, MontType, false, false>(static_cast<U>(maxU - range), range, dummy, mmbr, seed, ebr));
1344-
vecTimingPA.push_back(
1345-
bench_partial_array_pow<hc::LowlatencyTag, 2, 3, ARRAY_SIZE, MontType, false, false>(static_cast<U>(maxU - range), range, dummy, mmbr, seed, ebr));
1346-
vecTimingPA.push_back(
1347-
bench_partial_array_pow<hc::LowlatencyTag, 2, 4, ARRAY_SIZE, MontType, false, false>(static_cast<U>(maxU - range), range, dummy, mmbr, seed, ebr));
1348-
vecTimingPA.push_back(
1349-
bench_partial_array_pow<hc::LowlatencyTag, 2, 5, ARRAY_SIZE, MontType, false, false>(static_cast<U>(maxU - range), range, dummy, mmbr, seed, ebr));
13501342

13511343
//template <class PTAG, size_t ARRAY_SIZE,
13521344
// class MontType, bool USE_SQUARING_VALUE_OPTIMIZATION, bool USE_SLIDING_WINDOW_OPTIMIZATION,
13531345
// typename U, typename ST>bench_PA_2(...)
13541346

1355-
bench_PA_2<hc::LowuopsTag, ARRAY_SIZE, MontType, false, false>(vecTimingPA, maxU, range, dummy, mmbr, seed, ebr);
1356-
bench_PA_2<hc::LowlatencyTag, ARRAY_SIZE, MontType, false, false>(vecTimingPA, maxU, range, dummy, mmbr, seed, ebr);
1347+
bench_PA_2<PTAG, ARRAY_SIZE, MontType, false, false>(vecTimingPA, maxU, range, dummy, mmbr, seed, ebr);
1348+
bench_PA_2<PTAG, ARRAY_SIZE, MontType, false, true>(vecTimingPA, maxU, range, dummy, mmbr, seed, ebr);
13571349

1358-
bench_PA_2<hc::LowuopsTag, ARRAY_SIZE, MontType, false, true>(vecTimingPA, maxU, range, dummy, mmbr, seed, ebr);
1359-
bench_PA_2<hc::LowlatencyTag, ARRAY_SIZE, MontType, false, true>(vecTimingPA, maxU, range, dummy, mmbr, seed, ebr);
1350+
1351+
if constexpr (std::is_same<typename MontType::MontType::MontyTag,
1352+
::hurchalla::detail::TagMontyFullrange>::value) {
1353+
bench_PA_2<PTAG, ARRAY_SIZE, MontType, true, false>(vecTimingPA, maxU, range, dummy, mmbr, seed, ebr);
1354+
bench_PA_2<PTAG, ARRAY_SIZE, MontType, true, true>(vecTimingPA, maxU, range, dummy, mmbr, seed, ebr);
1355+
}
13601356
}
13611357

13621358

13631359

13641360
template <size_t ARRAY_SIZE, class MontType, typename U, typename ST>
1365-
void bench_PA_use_sv(std::vector<TimingPA>& vecTimingPA,
1361+
void bench_PA_all(std::vector<TimingPA>& vecTimingPA,
13661362
U maxU, U range, U& dummy, unsigned int mmbr, ST seed, unsigned int ebr)
13671363
{
13681364
namespace hc = ::hurchalla;
13691365

1370-
//template <class PTAG, size_t ARRAY_SIZE,
1371-
// class MontType, bool USE_SQUARING_VALUE_OPTIMIZATION, bool USE_SLIDING_WINDOW_OPTIMIZATION,
1372-
// typename U, typename ST>bench_PA_2(...)
1373-
1374-
bench_PA_2<hc::LowuopsTag, ARRAY_SIZE, MontType, true, false>(vecTimingPA, maxU, range, dummy, mmbr, seed, ebr);
1375-
bench_PA_2<hc::LowlatencyTag, ARRAY_SIZE, MontType, true, false>(vecTimingPA, maxU, range, dummy, mmbr, seed, ebr);
1366+
//template <class PTAG, size_t ARRAY_SIZE, class MontType, typename U, typename ST>
1367+
//void bench_PA_PTAG(...
13761368

1377-
bench_PA_2<hc::LowuopsTag, ARRAY_SIZE, MontType, true, true>(vecTimingPA, maxU, range, dummy, mmbr, seed, ebr);
1378-
bench_PA_2<hc::LowlatencyTag, ARRAY_SIZE, MontType, true, true>(vecTimingPA, maxU, range, dummy, mmbr, seed, ebr);
1369+
bench_PA_PTAG<hc::LowuopsTag, ARRAY_SIZE, MontType>(vecTimingPA, maxU, range, dummy, mmbr, seed, ebr);
1370+
bench_PA_PTAG<hc::LowlatencyTag, ARRAY_SIZE, MontType>(vecTimingPA, maxU, range, dummy, mmbr, seed, ebr);
13791371
}
13801372

13811373

13821374

13831375

1376+
13841377
int main(int argc, char** argv)
13851378
{
13861379
namespace hc = hurchalla;
@@ -1485,32 +1478,17 @@ using namespace hurchalla;
14851478
for (size_t i=0; i<4; ++i) {
14861479
for (size_t j=0; j<timingPA[i].size(); ++j) {
14871480

1488-
#if 1
1489-
bench_PA_no_sv<2, MontType>(timingPA[i][j], maxU, range, dummy, mmbr[i], seed, ebr[i]);
1490-
bench_PA_no_sv<3, MontType>(timingPA[i][j], maxU, range, dummy, mmbr[i], seed, ebr[i]);
1491-
bench_PA_no_sv<4, MontType>(timingPA[i][j], maxU, range, dummy, mmbr[i], seed, ebr[i]);
1492-
bench_PA_no_sv<5, MontType>(timingPA[i][j], maxU, range, dummy, mmbr[i], seed, ebr[i]);
1493-
bench_PA_no_sv<6, MontType>(timingPA[i][j], maxU, range, dummy, mmbr[i], seed, ebr[i]);
1494-
bench_PA_no_sv<7, MontType>(timingPA[i][j], maxU, range, dummy, mmbr[i], seed, ebr[i]);
1495-
bench_PA_no_sv<8, MontType>(timingPA[i][j], maxU, range, dummy, mmbr[i], seed, ebr[i]);
1496-
bench_PA_no_sv<10, MontType>(timingPA[i][j], maxU, range, dummy, mmbr[i], seed, ebr[i]);
1497-
bench_PA_no_sv<12, MontType>(timingPA[i][j], maxU, range, dummy, mmbr[i], seed, ebr[i]);
1498-
bench_PA_no_sv<14, MontType>(timingPA[i][j], maxU, range, dummy, mmbr[i], seed, ebr[i]);
1481+
bench_PA_all<2, MontType>(timingPA[i][j], maxU, range, dummy, mmbr[i], seed, ebr[i]);
1482+
bench_PA_all<3, MontType>(timingPA[i][j], maxU, range, dummy, mmbr[i], seed, ebr[i]);
1483+
bench_PA_all<4, MontType>(timingPA[i][j], maxU, range, dummy, mmbr[i], seed, ebr[i]);
1484+
bench_PA_all<5, MontType>(timingPA[i][j], maxU, range, dummy, mmbr[i], seed, ebr[i]);
1485+
bench_PA_all<6, MontType>(timingPA[i][j], maxU, range, dummy, mmbr[i], seed, ebr[i]);
1486+
bench_PA_all<7, MontType>(timingPA[i][j], maxU, range, dummy, mmbr[i], seed, ebr[i]);
1487+
bench_PA_all<8, MontType>(timingPA[i][j], maxU, range, dummy, mmbr[i], seed, ebr[i]);
1488+
bench_PA_all<10, MontType>(timingPA[i][j], maxU, range, dummy, mmbr[i], seed, ebr[i]);
1489+
bench_PA_all<12, MontType>(timingPA[i][j], maxU, range, dummy, mmbr[i], seed, ebr[i]);
1490+
bench_PA_all<14, MontType>(timingPA[i][j], maxU, range, dummy, mmbr[i], seed, ebr[i]);
14991491

1500-
if constexpr (std::is_same<typename MontType::MontType::MontyTag,
1501-
::hurchalla::detail::TagMontyFullrange>::value) {
1502-
bench_PA_use_sv<2, MontType>(timingPA[i][j], maxU, range, dummy, mmbr[i], seed, ebr[i]);
1503-
bench_PA_use_sv<3, MontType>(timingPA[i][j], maxU, range, dummy, mmbr[i], seed, ebr[i]);
1504-
bench_PA_use_sv<4, MontType>(timingPA[i][j], maxU, range, dummy, mmbr[i], seed, ebr[i]);
1505-
bench_PA_use_sv<5, MontType>(timingPA[i][j], maxU, range, dummy, mmbr[i], seed, ebr[i]);
1506-
bench_PA_use_sv<6, MontType>(timingPA[i][j], maxU, range, dummy, mmbr[i], seed, ebr[i]);
1507-
bench_PA_use_sv<7, MontType>(timingPA[i][j], maxU, range, dummy, mmbr[i], seed, ebr[i]);
1508-
bench_PA_use_sv<8, MontType>(timingPA[i][j], maxU, range, dummy, mmbr[i], seed, ebr[i]);
1509-
bench_PA_use_sv<10, MontType>(timingPA[i][j], maxU, range, dummy, mmbr[i], seed, ebr[i]);
1510-
bench_PA_use_sv<12, MontType>(timingPA[i][j], maxU, range, dummy, mmbr[i], seed, ebr[i]);
1511-
bench_PA_use_sv<14, MontType>(timingPA[i][j], maxU, range, dummy, mmbr[i], seed, ebr[i]);
1512-
}
1513-
#endif
15141492
}
15151493
}
15161494
#ifdef TEST_CORRECTNESS_ONLY
@@ -2019,6 +1997,23 @@ std::cout << "Timings By Test Type:\n";
20191997
for (size_t i=0; i<4; ++i) {
20201998
for (size_t j=0; j<timings[i].size(); ++j) {
20211999

2000+
#if 1
2001+
// Partial array pow using ARRAY_SIZE 1 is essentially a scalar pow.
2002+
// We enter it into the main timing records with (code_section + 50) to distinguish it from the bench_range() entries.
2003+
//
2004+
// Note this is somewhat of a hack since we're assuming the tests and number of tests are
2005+
// the same for both bench_range() and bench_partial_array_pow(), which they are for now.
2006+
// If they weren't the same, then entering the PA timings into the main timings would
2007+
// result in invalid timing comparisons/rankings.
2008+
//
2009+
std::vector<TimingPA> vecTimingPA;
2010+
bench_PA_PTAG<hc::LowlatencyTag, 1, MontType>(vecTimingPA, maxU, range, dummy, mmbr[i], seed, ebr[i]);
2011+
for (auto timingPA : vecTimingPA) {
2012+
timings[i][j].push_back(Timing(timingPA.table_bits, timingPA.uses_sliding_window,
2013+
timingPA.code_section + 50, timingPA.time, timingPA.uses_squaring_values));
2014+
}
2015+
#endif
2016+
20222017
#if 1
20232018
timings[i][j].push_back(
20242019
bench_range<0, false, 0, MontType, false>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));

0 commit comments

Comments
 (0)