Skip to content

Commit 41bd907

Browse files
committed
add ARM64 inline-asm to 128bit REDC function
1 parent e5077b5 commit 41bd907

File tree

9 files changed

+625
-361
lines changed

9 files changed

+625
-361
lines changed

build_tests.sh

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,10 @@
1818
# number of jobs will be used.
1919
# -r specifies to run all tests after the build. Without -r, no tests will run.
2020
# -a specifies you want to compile the code using typically helpful (how much it
21-
# helps depends on your compiler) inline asm optimizations, which makes for
22-
# the fastest binaries but of course has the downsides of inline asm -
23-
# primarily that inline asm is extremely difficult to properly test.
21+
# helps depends on your compiler) inline asm optimizations, which usually
22+
# makes the fastest binaries. It is probably better to use -u instead for
23+
# building these tests, since if we want to test some of the asm, we probably
24+
# want to test all of it.
2425
# -u specifies that you want to compile the code using all available inline asm
2526
# routines, so that the tests will cover all of them (this is not expected to
2627
# result in the fastest binaries).

modular_arithmetic/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ include(FetchContent)
7575
FetchContent_Declare(
7676
hurchalla_util
7777
GIT_REPOSITORY https://github.com/hurchalla/util.git
78-
GIT_TAG ba38c2c5e1164e0e7ab480e4fc959c3a43183c0e
78+
GIT_TAG 6901743704ac1caf4e99090ce52e52a40147ba82
7979
)
8080
FetchContent_MakeAvailable(hurchalla_util)
8181

montgomery_arithmetic/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ include(FetchContent)
7979
FetchContent_Declare(
8080
hurchalla_util
8181
GIT_REPOSITORY https://github.com/hurchalla/util.git
82-
GIT_TAG ba38c2c5e1164e0e7ab480e4fc959c3a43183c0e
82+
GIT_TAG 6901743704ac1caf4e99090ce52e52a40147ba82
8383
)
8484
FetchContent_MakeAvailable(hurchalla_util)
8585

montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/experimental/montgomery_two_pow/testbench_montgomery_two_pow.cpp

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -941,7 +941,7 @@ using namespace hurchalla;
941941
constexpr int NUM_TEST_REPETITIONS = 10;
942942

943943

944-
#if 0
944+
#if 1
945945
std::cout << "\nbegin benchmarks - array two_pow\n";
946946

947947
// warm up call
@@ -955,17 +955,17 @@ using namespace hurchalla;
955955
for (size_t j=0; j<timingA[i].size(); ++j) {
956956

957957
timingA[i][j].push_back(
958-
bench_array_two_pow<0, 31, 3, MontType, false>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
958+
bench_array_two_pow<0, 30, 3, MontType, false>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
959959
timingA[i][j].push_back(
960-
bench_array_two_pow<0, 31, 4, MontType, false>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
960+
bench_array_two_pow<0, 30, 4, MontType, false>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
961961
timingA[i][j].push_back(
962-
bench_array_two_pow<0, 31, 5, MontType, false>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
962+
bench_array_two_pow<0, 30, 5, MontType, false>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
963963
timingA[i][j].push_back(
964-
bench_array_two_pow<0, 31, 6, MontType, false>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
964+
bench_array_two_pow<0, 30, 6, MontType, false>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
965965
timingA[i][j].push_back(
966-
bench_array_two_pow<0, 31, 7, MontType, false>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
966+
bench_array_two_pow<0, 30, 7, MontType, false>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
967967
timingA[i][j].push_back(
968-
bench_array_two_pow<0, 31, 8, MontType, false>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
968+
bench_array_two_pow<0, 30, 8, MontType, false>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
969969

970970

971971
#if 0
@@ -1779,7 +1779,7 @@ std::cout << "Timings By Test Type:\n";
17791779

17801780

17811781

1782-
#if 1
1782+
#if 0
17831783
std::cout << "\nbegin benchmarks - scalar two_pow\n";
17841784

17851785
// warm up to get cpu boost (or throttle) going
@@ -1796,7 +1796,7 @@ std::cout << "Timings By Test Type:\n";
17961796
// format is bench_range<TABLE_BITS, USE_SLIDING_WINDOW_OPTIMIZATION, CODE_SECTION,
17971797
// MontType, USE_SQUARING_VALUE_OPTIMIZATION>
17981798
timings[i][j].push_back(
1799-
bench_range<0, false, 22, MontType, false>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
1799+
bench_range<0, false, 34, MontType, true>(static_cast<U>(maxU - range), range, dummy, mmbr[i], seed, ebr[i]));
18001800

18011801
#if 0
18021802
// This is a copy/paste of the "best of best" code sections from further below (nothing is new here).

0 commit comments

Comments
 (0)