99#define HURCHALLA_MODULAR_ARITHMETIC_IMPL_MODULAR_ADDITION_H_INCLUDED
1010
1111
12- #include " hurchalla/modular_arithmetic/detail/optimization_tag_structs.h"
1312#include " hurchalla/util/traits/extensible_make_unsigned.h"
1413#include " hurchalla/util/traits/ut_numeric_limits.h"
1514#include " hurchalla/util/conditional_select.h"
2120namespace hurchalla { namespace detail {
2221
2322
24- // With regard to the LowlatencyTag vs. the LowuopsTag functions:
25- // If neither 'b' nor 'modulus' was set/modified recently before the call of
26- // this modular addition function, then the LowlatencyTag versions will likely
27- // provide lower latency than the LowuopsTag versions. Note that LowlatencyTag
28- // will typically use more uops and create more pressure on the ALU than
29- // LowuopsTag, unless the compiler can loop hoist the extra instruction(s)
30- // involving 'b' and 'modulus'.
31-
3223// Fyi: the purpose of having structs with static member functions is to
3324// disallow ADL and to make specializations simple and easy.
3425
35- // primary template for default implementation
// (These are old-side lines of the diff, marked with a minus sign.)
// The primary template has an empty body: it provides no call() member, so
// only the explicit specializations on PTAG (LowuopsTag and LowlatencyTag,
// which follow on the old side of this diff) supply an implementation.
36- template <class PTAG >
37- struct default_impl_modadd_unsigned {
38- };
39-
40- // --- Version #0 (for low uops and low ALU use) ---
// (Old-side lines of the diff.)  LowuopsTag specialization of the default
// modular addition.  call(a, b, modulus) accepts an unsigned integer type T,
// requires modulus > 0 and both inputs already reduced (a < modulus and
// b < modulus), and returns a value that the postcondition asserts to lie in
// the range [0, modulus).
41- template <>
42- struct default_impl_modadd_unsigned <LowuopsTag> {
43- template <typename T>
44- HURCHALLA_FORCE_INLINE static T call (T a, T b, T modulus)
45- {
// Compile-time checks: T must be an integer type and must not be signed.
46- static_assert (ut_numeric_limits<T>::is_integer, " " );
47- static_assert (!(ut_numeric_limits<T>::is_signed), " " );
48- HPBC_PRECONDITION2 (modulus>0 );
49- HPBC_PRECONDITION2 (a<modulus); // i.e. the input must be prereduced
50- HPBC_PRECONDITION2 (b<modulus); // i.e. the input must be prereduced
51-
// The sum is cast back to T, so it is truncated whenever a + b does not fit
// in T.
// NOTE(review): after such a truncation the wrapped sum compares less than
// modulus and would be selected unchanged below, which differs from
// (a + b) mod modulus.  For example with an 8 bit T, a = b = 200 and
// modulus = 201, sum becomes 144 while the expected answer is 199.  It looks
// like this version needs a + b to be representable in T -- confirm that
// callers restrict modulus accordingly.
52- T sum = static_cast <T>(a + b);
// Candidate result for the case sum >= modulus.  When sum < modulus this
// unsigned subtraction wraps, and the value is discarded by the select.
53- T result = static_cast <T>(sum - modulus);
54- // result = (sum < modulus) ? sum : result;
// Presumably conditional_select is a select helper equivalent to the ternary
// shown in the comment above -- confirm in conditional_select.h.
55- result = ::hurchalla::conditional_select (sum < modulus, sum, result);
56-
57- HPBC_POSTCONDITION2 (static_cast <T>(0 ) <= result && result < modulus);
58- return result;
59- }
60- };
61-
6226#if !defined(__clang__)
6327// Note: only clang (on x64 and arm32/64) seems to produce optimal code from
6428// the theoretically preferable Version 2 further below. And so for gcc(x64 and
6529// arm32/64 and risc-V), MSVC (x64 and arm64), and icc, Version #1 here tends to
6630// compile to better machine code in practice. Gcc in particular tends to
6731// generate conditional branches for Version 2, which we don't want.
6832
69- // --- LowlatencyTag Version #1 ---
33+ // --- Version #1 ---
7034// This is a relatively straightforward and easy to understand version.
7135//
7236// However, on x86, Version #1's use of (modulus - b) will often require two
@@ -85,8 +49,7 @@ struct default_impl_modadd_unsigned<LowuopsTag> {
8549// between the two function versions. We would generally expect the latency of
8650// the two versions to be the same, but as always this depends on whether the
8751// compiler generates good (or not good) machine code.
88- template <>
89- struct default_impl_modadd_unsigned <LowlatencyTag> {
52+ struct default_impl_modadd_unsigned {
9053 template <typename T>
9154 HURCHALLA_FORCE_INLINE static T call (T a, T b, T modulus)
9255 {
@@ -113,13 +76,12 @@ struct default_impl_modadd_unsigned<LowlatencyTag> {
11376};
11477#else
11578
116- // --- LowlatencyTag Version #2 ---
79+ // --- Version #2 ---
11780// This is a more difficult to understand default implementation version. The
11881// proof of this function's correctness is given by the theorem in the comments
11982// at the end of this file. See the notes at the end of those comments to
12083// understand the implementation details.
121- template <>
122- struct default_impl_modadd_unsigned <LowlatencyTag> {
84+ struct default_impl_modadd_unsigned {
12385 template <typename T>
12486 HURCHALLA_FORCE_INLINE static T call (T a, T b, T modulus)
12587 {
@@ -145,17 +107,17 @@ struct default_impl_modadd_unsigned<LowlatencyTag> {
145107
146108
147109// primary template
// Used for every T that has no explicit specialization of
// impl_modular_addition_unsigned: call() simply forwards a, b and modulus to
// the default implementation.
// In this diff the old (minus) lines carry a PTAG template parameter and
// forward to default_impl_modadd_unsigned<PTAG>; the new (plus) lines drop
// PTAG and forward to the single untagged default_impl_modadd_unsigned.
148- template <typename T, class PTAG >
110+ template <typename T>
149111struct impl_modular_addition_unsigned {
150112 HURCHALLA_FORCE_INLINE static T call (T a, T b, T modulus)
151113 {
152- return default_impl_modadd_unsigned<PTAG> ::call (a, b, modulus);
114+ return default_impl_modadd_unsigned::call (a, b, modulus);
153115 }
154116};
155117
156118
157119// These inline asm functions implement optimizations of the default function
158- // versions #0 and #2 (above), for LowuopsTag and LowlatencyTag respectively.
120+ // version #2 (above).
159121
160122// MSVC doesn't support inline asm, so we skip it.
161123
@@ -164,13 +126,13 @@ struct impl_modular_addition_unsigned {
164126 defined(HURCHALLA_TARGET_ISA_X86_64) && !defined(_MSC_VER)
165127
166128
167- // These LowlatencyTag functions contain the calculation "b - modulus". If
168- // neither 'b' nor 'modulus' was recently set/modified, then "b - modulus"
169- // will usually be calculated at the same time as earlier work by the CPU, or
170- // in a loop it could potentially be loop hoisted by the compiler. Either way,
171- // this potentially allows lower latency than the LowuopsTag version.
129+ // Note: these functions contain the calculation "b - modulus". If neither 'b'
130+ // nor 'modulus' was recently set/modified, then "b - modulus" will usually be
131+ // calculated at the same time as earlier work by the CPU, or in a loop it could
132+ // potentially be loop hoisted by the compiler. This is what provides a
133+ // potential for lowered latency.
172134template <>
173- struct impl_modular_addition_unsigned<std::uint32_t, LowlatencyTag > {
135+ struct impl_modular_addition_unsigned<std::uint32_t> {
174136 HURCHALLA_FORCE_INLINE static
175137 std::uint32_t call (std::uint32_t a, std::uint32_t b, std::uint32_t modulus)
176138 {
@@ -201,13 +163,13 @@ struct impl_modular_addition_unsigned<std::uint32_t, LowlatencyTag> {
201163
202164 HPBC_POSTCONDITION2 (result < modulus); // uint32_t guarantees result>=0.
203165 HPBC_POSTCONDITION2 (result ==
204- default_impl_modadd_unsigned<LowlatencyTag> ::call (a, b, modulus));
166+ default_impl_modadd_unsigned::call (a, b, modulus));
205167 return result;
206168 }
207169};
208170
209171template <>
210- struct impl_modular_addition_unsigned <std::uint64_t , LowlatencyTag > {
172+ struct impl_modular_addition_unsigned <std::uint64_t > {
211173 HURCHALLA_FORCE_INLINE static
212174 std::uint64_t call (std::uint64_t a, std::uint64_t b, std::uint64_t modulus)
213175 {
@@ -232,14 +194,14 @@ struct impl_modular_addition_unsigned<std::uint64_t, LowlatencyTag> {
232194
233195 HPBC_POSTCONDITION2 (result < modulus); // uint64_t guarantees result>=0.
234196 HPBC_POSTCONDITION2 (result ==
235- default_impl_modadd_unsigned<LowlatencyTag> ::call (a, b, modulus));
197+ default_impl_modadd_unsigned::call (a, b, modulus));
236198 return result;
237199 }
238200};
239201
240202#ifdef HURCHALLA_ENABLE_INLINE_ASM_128_BIT
241203template <>
242- struct impl_modular_addition_unsigned <__uint128_t , LowlatencyTag > {
204+ struct impl_modular_addition_unsigned <__uint128_t > {
243205 HURCHALLA_FORCE_INLINE static
244206 __uint128_t call (__uint128_t a, __uint128_t b, __uint128_t modulus)
245207 {
@@ -271,109 +233,7 @@ struct impl_modular_addition_unsigned<__uint128_t, LowlatencyTag> {
271233
272234 HPBC_POSTCONDITION2 (result < modulus); // __uint128_t guarantees result>=0.
273235 HPBC_POSTCONDITION2 (result ==
274- default_impl_modadd_unsigned<LowlatencyTag>::call (a, b, modulus));
275- return result;
276- }
277- };
278- #endif
279-
280-
281- // These LowuopsTag versions should have the lowest ALU use (one add, one sub),
282- // and often the lowest uop count.
// (Old-side lines of the diff.)  x86-64 inline asm specialization for 32 bit
// operands.  Requires modulus > 0, a < modulus and b < modulus; the
// postconditions assert that the result is below modulus and equals what
// default_impl_modadd_unsigned<LowuopsTag>::call returns for the same inputs.
283- template <>
284- struct impl_modular_addition_unsigned <std::uint32_t , LowuopsTag> {
285- HURCHALLA_FORCE_INLINE static
286- std::uint32_t call (std::uint32_t a, std::uint32_t b, std::uint32_t modulus)
287- {
288- using std::uint32_t ;
289- HPBC_PRECONDITION2 (modulus>0 );
290- HPBC_PRECONDITION2 (a<modulus); // uint32_t guarantees a>=0.
291- HPBC_PRECONDITION2 (b<modulus); // uint32_t guarantees b>=0.
292-
// NOTE(review): the addition is performed in 32 bits and wraps when a + b
// exceeds the uint32_t range (possible once modulus is above 2^31) --
// confirm that callers restrict modulus, since the wrapped sum would be
// returned as-is by the cmov below.
293- uint32_t sum = static_cast <uint32_t >(a + b);
// Keep a copy of the un-subtracted sum so the cmov can restore it.
294- uint32_t tmp = sum;
// subl computes sum - m in place and sets the carry flag when the unsigned
// subtraction borrows (sum < m); cmovbl then copies the saved sum back in
// that case.  The +&r constraint makes sum a read-write, early-clobber
// register operand, and the cc clobber declares that the flags are modified.
// The clang branch uses register-only input constraints instead of
// register-or-memory ones because of the LLVM bug linked on that line.
295- __asm__ (" subl %[m], %[sum] \n\t " /* tmp2 = sum - m */
296- " cmovbl %[tmp], %[sum] \n\t " /* sum = (sum<m) ? tmp : tmp2 */
297- : [sum]" +&r" (sum)
298- # if defined(__clang__) /* https://bugs.llvm.org/show_bug.cgi?id=20197 */
299- : [m]" r" (modulus), [tmp]" r" (tmp)
300- # else
301- : [m]" rm" (modulus), [tmp]" rm" (tmp)
302- # endif
303- : " cc" );
304- uint32_t result = sum;
305-
306- HPBC_POSTCONDITION2 (result < modulus); // uint32_t guarantees result>=0.
// Cross-check the asm against the portable LowuopsTag implementation.
307- HPBC_POSTCONDITION2 (result ==
308- default_impl_modadd_unsigned<LowuopsTag>::call (a, b, modulus));
309- return result;
310- }
311- };
312-
// (Old-side lines of the diff.)  x86-64 inline asm specialization of the
// LowuopsTag modular addition for 64 bit operands.  Requires modulus > 0,
// a < modulus and b < modulus; the postconditions assert that the result is
// below modulus and equals what default_impl_modadd_unsigned<LowuopsTag>::call
// returns for the same inputs.
313- template <>
314- struct impl_modular_addition_unsigned <std::uint64_t , LowuopsTag> {
315- HURCHALLA_FORCE_INLINE static
316- std::uint64_t call (std::uint64_t a, std::uint64_t b, std::uint64_t modulus)
317- {
318- using std::uint64_t ;
319- HPBC_PRECONDITION2 (modulus>0 );
320- HPBC_PRECONDITION2 (a<modulus); // uint64_t guarantees a>=0.
321- HPBC_PRECONDITION2 (b<modulus); // uint64_t guarantees b>=0.
322-
// NOTE(review): the addition is performed in 64 bits and wraps when a + b
// exceeds the uint64_t range (possible once modulus is above 2^63) --
// confirm that callers restrict modulus, since the wrapped sum would be
// returned as-is by the cmov below.
323- uint64_t sum = static_cast <uint64_t >(a + b);
// Keep a copy of the un-subtracted sum so the cmov can restore it.
324- uint64_t tmp = sum;
// subq computes sum - m in place and sets the carry flag when the unsigned
// subtraction borrows (sum < m); cmovbq then copies the saved sum back in
// that case.  The +&r constraint makes sum a read-write, early-clobber
// register operand, and the cc clobber declares that the flags are modified.
// The clang branch uses register-only input constraints instead of
// register-or-memory ones because of the LLVM bug linked on that line.
325- __asm__ (" subq %[m], %[sum] \n\t " /* tmp2 = sum - m */
326- " cmovbq %[tmp], %[sum] \n\t " /* sum = (sum<m) ? tmp : tmp2 */
327- : [sum]" +&r" (sum)
328- # if defined(__clang__) /* https://bugs.llvm.org/show_bug.cgi?id=20197 */
329- : [m]" r" (modulus), [tmp]" r" (tmp)
330- # else
331- : [m]" rm" (modulus), [tmp]" rm" (tmp)
332- # endif
333- : " cc" );
334- uint64_t result = sum;
335-
336- HPBC_POSTCONDITION2 (result < modulus); // uint64_t guarantees result>=0.
// Cross-check the asm against the portable LowuopsTag implementation.
337- HPBC_POSTCONDITION2 (result ==
338- default_impl_modadd_unsigned<LowuopsTag>::call (a, b, modulus));
339- return result;
340- }
341- };
342-
343- #ifdef HURCHALLA_ENABLE_INLINE_ASM_128_BIT
344- template <>
345- struct impl_modular_addition_unsigned <__uint128_t , LowuopsTag> {
346- HURCHALLA_FORCE_INLINE static
347- __uint128_t call (__uint128_t a, __uint128_t b, __uint128_t modulus)
348- {
349- using std::uint64_t ;
350- HPBC_PRECONDITION2 (modulus>0 );
351- HPBC_PRECONDITION2 (a<modulus); // __uint128_t guarantees a>=0.
352- HPBC_PRECONDITION2 (b<modulus); // __uint128_t guarantees b>=0.
353-
354- __uint128_t sum = static_cast <__uint128_t >(a + b);
355- uint64_t sumlo = static_cast <uint64_t >(sum);
356- uint64_t sumhi = static_cast <uint64_t >(sum >> 64 );
357- uint64_t tmplo = sumlo;
358- uint64_t tmphi = sumhi;
359- uint64_t mlo = static_cast <uint64_t >(modulus);
360- uint64_t mhi = static_cast <uint64_t >(modulus >> 64 );
361- __asm__ (" subq %[mlo], %[sumlo] \n\t " /* tmp2 = sum - m */
362- " sbbq %[mhi], %[sumhi] \n\t "
363- " cmovbq %[tmplo], %[sumlo] \n\t " /* sum = (sum<m) ? tmp : tmp2 */
364- " cmovbq %[tmphi], %[sumhi] \n\t "
365- : [sumlo]" +&r" (sumlo), [sumhi]" +&r" (sumhi)
366- # if defined(__clang__) /* https://bugs.llvm.org/show_bug.cgi?id=20197 */
367- : [mlo]" r" (mlo), [mhi]" r" (mhi), [tmplo]" r" (tmplo), [tmphi]" r" (tmphi)
368- # else
369- : [mlo]" rm" (mlo), [mhi]" rm" (mhi), [tmplo]" rm" (tmplo), [tmphi]" rm" (tmphi)
370- # endif
371- : " cc" );
372- __uint128_t result = (static_cast <__uint128_t >(sumhi) << 64 ) | sumlo;
373-
374- HPBC_POSTCONDITION2 (result < modulus); // __uint128_t guarantees result>=0.
375- HPBC_POSTCONDITION2 (result ==
376- default_impl_modadd_unsigned<LowuopsTag>::call (a, b, modulus));
236+ default_impl_modadd_unsigned::call (a, b, modulus));
377237 return result;
378238 }
379239};
@@ -383,23 +243,21 @@ struct impl_modular_addition_unsigned<__uint128_t, LowuopsTag> {
383243#endif
384244
385245
386- // You must use either LowlatencyTag or LowuopsTag for PTAG. See comment at
387- // top of this file for details.
388246
389247// version for unsigned T
// Primary template of impl_modular_addition.  The trailing unnamed bool
// parameter defaults to ut_numeric_limits<T>::is_signed; a partial
// specialization for the value true (signed T) follows, so this body is the
// one used for unsigned T.  call() statically asserts that T is an unsigned
// integer type and forwards to impl_modular_addition_unsigned.
// In this diff the old (minus) lines carry a PTAG template parameter that the
// new (plus) lines remove.
390- template <typename T, class PTAG , bool = ut_numeric_limits<T>::is_signed>
248+ template <typename T, bool = ut_numeric_limits<T>::is_signed>
391249struct impl_modular_addition {
392250 HURCHALLA_FORCE_INLINE static T call (T a, T b, T modulus)
393251 {
394252 static_assert (ut_numeric_limits<T>::is_integer, " " );
395253 static_assert (!(ut_numeric_limits<T>::is_signed), " " );
396- return impl_modular_addition_unsigned<T,PTAG >::call (a, b, modulus);
254+ return impl_modular_addition_unsigned<T>::call (a, b, modulus);
397255 }
398256};
399257
400258// version for signed T
401- template <typename T, class PTAG >
402- struct impl_modular_addition <T, PTAG, true > {
259+ template <typename T>
260+ struct impl_modular_addition <T, true > {
403261 HURCHALLA_FORCE_INLINE static T call (T a, T b, T modulus)
404262 {
405263 static_assert (ut_numeric_limits<T>::is_integer, " " );
@@ -424,10 +282,10 @@ struct impl_modular_addition<T, PTAG, true> {
424282 U mask = static_cast <U>(tmp >> ut_numeric_limits<T>::digits);
425283 U masked_modulus = static_cast <U>(mask & static_cast <U>(modulus));
426284 U result = static_cast <U>(static_cast <U>(tmp) + masked_modulus);
427- HPBC_ASSERT2 (result == impl_modular_addition_unsigned<U,PTAG >::call (
285+ HPBC_ASSERT2 (result == impl_modular_addition_unsigned<U>::call (
428286 static_cast <U>(a), static_cast <U>(b), static_cast <U>(modulus)));
429287#else
430- U result = impl_modular_addition_unsigned<U,PTAG >::call (static_cast <U>(a),
288+ U result = impl_modular_addition_unsigned<U>::call (static_cast <U>(a),
431289 static_cast <U>(b), static_cast <U>(modulus));
432290#endif
433291 return static_cast <T>(result);
0 commit comments