
Commit 3e81cbe

Author: Jeffrey Hurchalla (committed)
remove the (brand new and broken) lowuops version of modular_addition_prereduced_inputs
1 parent ccdc256 commit 3e81cbe

9 files changed: +132 additions, -255 deletions


build_tests.sh

Lines changed: 4 additions & 4 deletions
@@ -571,10 +571,10 @@ fi
 
 
 if [ "$run_tests" = true ]; then
-  ./$build_dir/test_ndebug_programming_by_contract --gtest_break_on_failure
-  exit_on_failure
-  ./$build_dir/test_programming_by_contract --gtest_break_on_failure
-  exit_on_failure
+#  ./$build_dir/test_ndebug_programming_by_contract --gtest_break_on_failure
+#  exit_on_failure
+#  ./$build_dir/test_programming_by_contract --gtest_break_on_failure
+#  exit_on_failure
   ./$build_dir/test_hurchalla_util --gtest_break_on_failure
   exit_on_failure
   ./$build_dir/test_hurchalla_modular_arithmetic --gtest_break_on_failure

modular_arithmetic/include/hurchalla/modular_arithmetic/detail/platform_specific/impl_modular_addition.h

Lines changed: 24 additions & 166 deletions
@@ -9,7 +9,6 @@
 #define HURCHALLA_MODULAR_ARITHMETIC_IMPL_MODULAR_ADDITION_H_INCLUDED
 
 
-#include "hurchalla/modular_arithmetic/detail/optimization_tag_structs.h"
 #include "hurchalla/util/traits/extensible_make_unsigned.h"
 #include "hurchalla/util/traits/ut_numeric_limits.h"
 #include "hurchalla/util/conditional_select.h"
@@ -21,52 +20,17 @@
 namespace hurchalla { namespace detail {
 
 
-// With regard to the LowlatencyTag vs. the LowuopsTag functions:
-// If neither 'b' nor 'modulus' was set/modified recently before the call of
-// this modular addition function, then the LowlatencyTag versions will likely
-// provide lower latency than the LowuopsTag versions. Note that LowlatencyTag
-// will typically use more uops and create more pressure on the ALU than
-// LowuopsTag, unless the compiler can loop hoist the extra instruction(s)
-// involving 'b' and 'modulus'.
-
 // Fyi: the purpose of having structs with static member functions is to
 // disallow ADL and to make specializations simple and easy.
 
-// primary template for default implementation
-template <class PTAG>
-struct default_impl_modadd_unsigned {
-};
-
-// --- Version #0 (for low uops and low ALU use) ---
-template <>
-struct default_impl_modadd_unsigned<LowuopsTag> {
-  template <typename T>
-  HURCHALLA_FORCE_INLINE static T call(T a, T b, T modulus)
-  {
-    static_assert(ut_numeric_limits<T>::is_integer, "");
-    static_assert(!(ut_numeric_limits<T>::is_signed), "");
-    HPBC_PRECONDITION2(modulus>0);
-    HPBC_PRECONDITION2(a<modulus); // i.e. the input must be prereduced
-    HPBC_PRECONDITION2(b<modulus); // i.e. the input must be prereduced
-
-    T sum = static_cast<T>(a + b);
-    T result = static_cast<T>(sum - modulus);
-    // result = (sum < modulus) ? sum : result;
-    result = ::hurchalla::conditional_select(sum < modulus, sum, result);
-
-    HPBC_POSTCONDITION2(static_cast<T>(0) <= result && result < modulus);
-    return result;
-  }
-};
-
 #if !defined(__clang__)
 // Note: only clang (on x64 and arm32/64) seems to produce optimal code from
 // the theoretically preferable Version 2 further below. And so for gcc(x64 and
 // arm32/64 and risc-V), MSVC (x64 and arm64), and icc, Version #1 here tends to
 // compile to better machine code in practice. Gcc in particular tends to
 // generate conditional branches for Version 2, which we don't want.
 
-// --- LowlatencyTag Version #1 ---
+// --- Version #1 ---
 // This is a relatively straightforward and easy to understand version.
 //
 // However, on x86, Version #1's use of (modulus - b) will often require two
@@ -85,8 +49,7 @@ struct default_impl_modadd_unsigned<LowuopsTag> {
 // between the two function versions. We would generally expect the latency of
 // the two versions to be the same, but as always this depends on whether the
 // compiler generates good (or not good) machine code.
-template <>
-struct default_impl_modadd_unsigned<LowlatencyTag> {
+struct default_impl_modadd_unsigned {
   template <typename T>
   HURCHALLA_FORCE_INLINE static T call(T a, T b, T modulus)
   {
@@ -113,13 +76,12 @@ struct default_impl_modadd_unsigned<LowlatencyTag> {
 };
 #else
 
-// --- LowlatencyTag Version #2 ---
+// --- Version #2 ---
 // This is a more difficult to understand default implementation version. The
 // proof of this function's correctness is given by the theorem in the comments
 // at the end of this file. See the notes at the end of those comments to
 // understand the implementation details.
-template <>
-struct default_impl_modadd_unsigned<LowlatencyTag> {
+struct default_impl_modadd_unsigned {
   template <typename T>
   HURCHALLA_FORCE_INLINE static T call(T a, T b, T modulus)
   {
@@ -145,17 +107,17 @@ struct default_impl_modadd_unsigned<LowlatencyTag> {
 
 
 // primary template
-template <typename T, class PTAG>
+template <typename T>
 struct impl_modular_addition_unsigned {
   HURCHALLA_FORCE_INLINE static T call(T a, T b, T modulus)
   {
-    return default_impl_modadd_unsigned<PTAG>::call(a, b, modulus);
+    return default_impl_modadd_unsigned::call(a, b, modulus);
   }
 };
 
 
 // These inline asm functions implement optimizations of the default function
-// versions #0 and #2 (above), for LowuopsTag and LowlatencyTag respectively.
+// version #2 (above).
 
 // MSVC doesn't support inline asm, so we skip it.
 
@@ -164,13 +126,13 @@ struct impl_modular_addition_unsigned {
 defined(HURCHALLA_TARGET_ISA_X86_64) && !defined(_MSC_VER)
 
 
-// These LowlatencyTag functions contain the calculation "b - modulus". If
-// neither 'b' nor 'modulus' was recently set/modified, then "b - modulus"
-// will usually be calculated at the same time as earlier work by the CPU, or
-// in a loop it could potentially be loop hoisted by the compiler. Either way,
-// this potentially allows lower latency than the LowuopsTag version.
+// Note: these functions contain the calculation "b - modulus". If neither 'b'
+// nor 'modulus' was recently set/modified, then "b - modulus" will usually be
+// calculated at the same time as earlier work by the CPU, or in a loop it could
+// potentially be loop hoisted by the compiler. This is what provides a
+// potential for lowered latency.
 template <>
-struct impl_modular_addition_unsigned<std::uint32_t, LowlatencyTag> {
+struct impl_modular_addition_unsigned<std::uint32_t> {
   HURCHALLA_FORCE_INLINE static
   std::uint32_t call(std::uint32_t a, std::uint32_t b, std::uint32_t modulus)
   {
@@ -201,13 +163,13 @@ struct impl_modular_addition_unsigned<std::uint32_t, LowlatencyTag> {
 
     HPBC_POSTCONDITION2(result < modulus); // uint32_t guarantees result>=0.
     HPBC_POSTCONDITION2(result ==
-              default_impl_modadd_unsigned<LowlatencyTag>::call(a, b, modulus));
+              default_impl_modadd_unsigned::call(a, b, modulus));
     return result;
   }
 };
 
 template <>
-struct impl_modular_addition_unsigned<std::uint64_t, LowlatencyTag> {
+struct impl_modular_addition_unsigned<std::uint64_t> {
   HURCHALLA_FORCE_INLINE static
   std::uint64_t call(std::uint64_t a, std::uint64_t b, std::uint64_t modulus)
   {
@@ -232,14 +194,14 @@ struct impl_modular_addition_unsigned<std::uint64_t, LowlatencyTag> {
 
    HPBC_POSTCONDITION2(result < modulus); // uint64_t guarantees result>=0.
    HPBC_POSTCONDITION2(result ==
-              default_impl_modadd_unsigned<LowlatencyTag>::call(a, b, modulus));
+              default_impl_modadd_unsigned::call(a, b, modulus));
    return result;
   }
 };
 
 #ifdef HURCHALLA_ENABLE_INLINE_ASM_128_BIT
 template <>
-struct impl_modular_addition_unsigned<__uint128_t, LowlatencyTag> {
+struct impl_modular_addition_unsigned<__uint128_t> {
   HURCHALLA_FORCE_INLINE static
   __uint128_t call(__uint128_t a, __uint128_t b, __uint128_t modulus)
   {
@@ -271,109 +233,7 @@ struct impl_modular_addition_unsigned<__uint128_t, LowlatencyTag> {
 
    HPBC_POSTCONDITION2(result < modulus); // __uint128_t guarantees result>=0.
    HPBC_POSTCONDITION2(result ==
-              default_impl_modadd_unsigned<LowlatencyTag>::call(a, b, modulus));
-   return result;
-  }
-};
-#endif
-
-
-// These LowuopsTag versions should have the lowest ALU use (one add, one sub),
-// and often the lowest uop count.
-template <>
-struct impl_modular_addition_unsigned<std::uint32_t, LowuopsTag> {
-  HURCHALLA_FORCE_INLINE static
-  std::uint32_t call(std::uint32_t a, std::uint32_t b, std::uint32_t modulus)
-  {
-    using std::uint32_t;
-    HPBC_PRECONDITION2(modulus>0);
-    HPBC_PRECONDITION2(a<modulus); // uint32_t guarantees a>=0.
-    HPBC_PRECONDITION2(b<modulus); // uint32_t guarantees b>=0.
-
-    uint32_t sum = static_cast<uint32_t>(a + b);
-    uint32_t tmp = sum;
-    __asm__ ("subl %[m], %[sum] \n\t"      /* tmp2 = sum - m */
-             "cmovbl %[tmp], %[sum] \n\t"  /* sum = (sum<m) ? tmp : tmp2 */
-             : [sum]"+&r"(sum)
-# if defined(__clang__)     /* https://bugs.llvm.org/show_bug.cgi?id=20197 */
-             : [m]"r"(modulus), [tmp]"r"(tmp)
-# else
-             : [m]"rm"(modulus), [tmp]"rm"(tmp)
-# endif
-             : "cc");
-    uint32_t result = sum;
-
-    HPBC_POSTCONDITION2(result < modulus); // uint32_t guarantees result>=0.
-    HPBC_POSTCONDITION2(result ==
-              default_impl_modadd_unsigned<LowuopsTag>::call(a, b, modulus));
-    return result;
-  }
-};
-
-template <>
-struct impl_modular_addition_unsigned<std::uint64_t, LowuopsTag> {
-  HURCHALLA_FORCE_INLINE static
-  std::uint64_t call(std::uint64_t a, std::uint64_t b, std::uint64_t modulus)
-  {
-    using std::uint64_t;
-    HPBC_PRECONDITION2(modulus>0);
-    HPBC_PRECONDITION2(a<modulus); // uint64_t guarantees a>=0.
-    HPBC_PRECONDITION2(b<modulus); // uint64_t guarantees b>=0.
-
-    uint64_t sum = static_cast<uint64_t>(a + b);
-    uint64_t tmp = sum;
-    __asm__ ("subq %[m], %[sum] \n\t"      /* tmp2 = sum - m */
-             "cmovbq %[tmp], %[sum] \n\t"  /* sum = (sum<m) ? tmp : tmp2 */
-             : [sum]"+&r"(sum)
-# if defined(__clang__)     /* https://bugs.llvm.org/show_bug.cgi?id=20197 */
-             : [m]"r"(modulus), [tmp]"r"(tmp)
-# else
-             : [m]"rm"(modulus), [tmp]"rm"(tmp)
-# endif
-             : "cc");
-    uint64_t result = sum;
-
-    HPBC_POSTCONDITION2(result < modulus); // uint64_t guarantees result>=0.
-    HPBC_POSTCONDITION2(result ==
-              default_impl_modadd_unsigned<LowuopsTag>::call(a, b, modulus));
-    return result;
-  }
-};
-
-#ifdef HURCHALLA_ENABLE_INLINE_ASM_128_BIT
-template <>
-struct impl_modular_addition_unsigned<__uint128_t, LowuopsTag> {
-  HURCHALLA_FORCE_INLINE static
-  __uint128_t call(__uint128_t a, __uint128_t b, __uint128_t modulus)
-  {
-    using std::uint64_t;
-    HPBC_PRECONDITION2(modulus>0);
-    HPBC_PRECONDITION2(a<modulus); // __uint128_t guarantees a>=0.
-    HPBC_PRECONDITION2(b<modulus); // __uint128_t guarantees b>=0.
-
-    __uint128_t sum = static_cast<__uint128_t>(a + b);
-    uint64_t sumlo = static_cast<uint64_t>(sum);
-    uint64_t sumhi = static_cast<uint64_t>(sum >> 64);
-    uint64_t tmplo = sumlo;
-    uint64_t tmphi = sumhi;
-    uint64_t mlo = static_cast<uint64_t>(modulus);
-    uint64_t mhi = static_cast<uint64_t>(modulus >> 64);
-    __asm__ ("subq %[mlo], %[sumlo] \n\t"      /* tmp2 = sum - m */
-             "sbbq %[mhi], %[sumhi] \n\t"
-             "cmovbq %[tmplo], %[sumlo] \n\t"  /* sum = (sum<m) ? tmp : tmp2 */
-             "cmovbq %[tmphi], %[sumhi] \n\t"
-             : [sumlo]"+&r"(sumlo), [sumhi]"+&r"(sumhi)
-# if defined(__clang__)     /* https://bugs.llvm.org/show_bug.cgi?id=20197 */
-             : [mlo]"r"(mlo), [mhi]"r"(mhi), [tmplo]"r"(tmplo), [tmphi]"r"(tmphi)
-# else
-             : [mlo]"rm"(mlo), [mhi]"rm"(mhi), [tmplo]"rm"(tmplo), [tmphi]"rm"(tmphi)
-# endif
-             : "cc");
-    __uint128_t result = (static_cast<__uint128_t>(sumhi) << 64) | sumlo;
-
-    HPBC_POSTCONDITION2(result < modulus); // __uint128_t guarantees result>=0.
-    HPBC_POSTCONDITION2(result ==
-              default_impl_modadd_unsigned<LowuopsTag>::call(a, b, modulus));
+              default_impl_modadd_unsigned::call(a, b, modulus));
    return result;
   }
 };
@@ -383,23 +243,21 @@ struct impl_modular_addition_unsigned<__uint128_t, LowuopsTag> {
 #endif
 
 
-// You must use either LowlatencyTag or LowuopsTag for PTAG. See comment at
-// top of this file for details.
 
 // version for unsigned T
-template <typename T, class PTAG, bool = ut_numeric_limits<T>::is_signed>
+template <typename T, bool = ut_numeric_limits<T>::is_signed>
 struct impl_modular_addition {
   HURCHALLA_FORCE_INLINE static T call(T a, T b, T modulus)
   {
     static_assert(ut_numeric_limits<T>::is_integer, "");
     static_assert(!(ut_numeric_limits<T>::is_signed), "");
-    return impl_modular_addition_unsigned<T,PTAG>::call(a, b, modulus);
+    return impl_modular_addition_unsigned<T>::call(a, b, modulus);
   }
 };
 
 // version for signed T
-template <typename T, class PTAG>
-struct impl_modular_addition<T, PTAG, true> {
+template <typename T>
+struct impl_modular_addition<T, true> {
   HURCHALLA_FORCE_INLINE static T call(T a, T b, T modulus)
   {
     static_assert(ut_numeric_limits<T>::is_integer, "");
@@ -424,10 +282,10 @@ struct impl_modular_addition<T, PTAG, true> {
     U mask = static_cast<U>(tmp >> ut_numeric_limits<T>::digits);
     U masked_modulus = static_cast<U>(mask & static_cast<U>(modulus));
     U result = static_cast<U>(static_cast<U>(tmp) + masked_modulus);
-    HPBC_ASSERT2(result == impl_modular_addition_unsigned<U,PTAG>::call(
+    HPBC_ASSERT2(result == impl_modular_addition_unsigned<U>::call(
                static_cast<U>(a), static_cast<U>(b), static_cast<U>(modulus)));
 #else
-    U result = impl_modular_addition_unsigned<U,PTAG>::call(static_cast<U>(a),
+    U result = impl_modular_addition_unsigned<U>::call(static_cast<U>(a),
                static_cast<U>(b), static_cast<U>(modulus));
 #endif
     return static_cast<T>(result);
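The body of Version #1 lies outside the changed hunks, so it is not visible in this diff. As context only, here is a minimal sketch of the (modulus - b) approach that the Version #1 comments describe, assuming prereduced unsigned inputs; the library's actual implementation may differ in its details.

#include <cassert>

// Sketch of a "Version #1"-style prereduced modular addition: (a + b) % modulus.
// Assumes T is an unsigned integer type, 0 <= a < modulus, and 0 <= b < modulus.
template <typename T>
T modadd_version1_sketch(T a, T b, T modulus)
{
    assert(modulus > 0 && a < modulus && b < modulus);
    // (modulus - b) cannot underflow because b < modulus.
    T tmp = static_cast<T>(modulus - b);
    // If a >= modulus - b then a + b >= modulus, so return (a + b) - modulus,
    // computed here as a - (modulus - b); otherwise a + b is already reduced.
    T result = (a >= tmp) ? static_cast<T>(a - tmp) : static_cast<T>(a + b);
    assert(result < modulus);
    return result;
}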
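Likewise, the retained Version #2 and the inline asm specializations are described above as building on a "b - modulus" term that the CPU can compute alongside earlier work, or that the compiler can hoist out of a loop. A hedged sketch of that idea follows; it is not the library's exact code, whose correctness proof lives in the comments at the end of the header.

#include <cassert>

// Sketch of the "b - modulus" idea: tmp depends only on b and modulus, so it
// can be computed before 'a' is ready (or hoisted out of a loop), which is
// where the potential latency benefit comes from.
template <typename T>
T modadd_b_minus_m_sketch(T a, T b, T modulus)
{
    assert(modulus > 0 && a < modulus && b < modulus);
    T tmp = static_cast<T>(b - modulus);   // wraps around: equals b - modulus + 2^N
    T sum = static_cast<T>(a + tmp);       // equals (a + b) - modulus, modulo 2^N
    // The addition a + tmp carries out exactly when a + b >= modulus; since the
    // wrapped tmp is nonzero, a carry is detectable as sum < a.
    T result = (sum < a) ? sum : static_cast<T>(a + b);
    assert(result < modulus);
    return result;
}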

modular_arithmetic/include/hurchalla/modular_arithmetic/detail/platform_specific/impl_modular_multiplication.h

Lines changed: 2 additions & 4 deletions
@@ -68,12 +68,10 @@ struct slow_modular_multiplication {
     T result = 0;
     while (b > 0) {
       namespace hc = ::hurchalla;
-      T tmp = hc::modular_addition_prereduced_inputs<T,LowlatencyTag>(
-                                                         a, result, modulus);
+      T tmp = hc::modular_addition_prereduced_inputs(a, result, modulus);
       // result = (b&1) ? tmp : result
       result = hc::conditional_select((b & 1u), tmp, result);
-      a = hc::modular_addition_prereduced_inputs<T,LowlatencyTag>(
-                                                         a, a, modulus);
+      a = hc::modular_addition_prereduced_inputs(a, a, modulus);
       b = static_cast<T>(b >> 1);
     }
     return result;
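With the LowuopsTag overload gone, callers like slow_modular_multiplication above simply omit the performance-tag template argument and let the type be deduced. A small usage sketch, assuming the public header path shown below and the usual prereduced-input preconditions:

#include <cstdint>
#include "hurchalla/modular_arithmetic/modular_addition.h"  // assumed public header path

int main()
{
    std::uint64_t modulus = 1000003;
    std::uint64_t a = 999999;   // prereduced: a < modulus
    std::uint64_t b = 7;        // prereduced: b < modulus

    // No LowlatencyTag/LowuopsTag template argument after this commit; the
    // integer type is deduced from the arguments.
    std::uint64_t sum =
             hurchalla::modular_addition_prereduced_inputs(a, b, modulus);

    return (sum == 3) ? 0 : 1;   // (999999 + 7) % 1000003 == 3
}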
