@@ -37,106 +37,233 @@ template <class MontyTag, class CompilerTag, class PowSizeTag>
3737struct tagged_montgomery_two_pow {};
3838
3939
40- // -- Generic tunings, measured with mac M2 with no asm enabled --
4140
42- // Full Specialization: clang and big uint pow and MontgomeryFull.
41+
42+ // These specializations call impl_montgomery_two_pow, using the template
43+ // arguments that were found to perform best in benchmarks for the
44+ // corresponding configuration of compiler/uint_type/Mont_type.
45+
46+
47+ // FYI, the template parameters of impl_montgomery_two_pow::call are:
48+ // scalar call
49+ // template <class MF, typename U,
50+ // bool USE_SLIDING_WINDOW_OPTIMIZATION,
51+ // size_t TABLE_BITS, size_t CODE_SECTION,
52+ // bool USE_SQUARING_VALUE_OPTIMIZATION>
53+ //
54+ // array call
55+ // template <class MF, typename U,
56+ // size_t ARRAY_SIZE, size_t TABLE_BITS, size_t CODE_SECTION,
57+ // bool USE_SQUARING_VALUE_OPTIMIZATION>
58+
59+
60+ // -- the following best performance tunings were measured with mac M2 --
61+
62+
63+ // Partial Specialization: clang and big uint pow.
64+ // Intended for MontgomeryFull, but catches all non-specialized monty types
65+ template <class MontyTag > struct tagged_montgomery_two_pow
66+ <MontyTag, Tag_montgomery_two_pow_clang, Tag_montgomery_two_pow_big>
67+ {
68+ template <class MF , typename U> HURCHALLA_FORCE_INLINE
69+ static typename MF::MontgomeryValue call (const MF& mf, U n)
70+ {
71+ return impl_montgomery_two_pow::call<MF, U, false , 0 , 34 , true >(mf, n);
72+ }
73+ template <class MF , typename U, std::size_t ARRAY_SIZE> HURCHALLA_FORCE_INLINE
74+ static std::array<typename MF::MontgomeryValue, ARRAY_SIZE>
75+ call (const std::array<MF, ARRAY_SIZE>& mf, const std::array<U, ARRAY_SIZE>& n)
76+ {
77+ return impl_montgomery_two_pow::call<MF, U, ARRAY_SIZE, 0 , 30 , false >(mf, n);
78+ }
79+ };
80+ // Full Specialization: clang and big uint pow and MontgomeryHalf.
4381template <> struct tagged_montgomery_two_pow
44- <TagMontyFullrange , Tag_montgomery_two_pow_clang, Tag_montgomery_two_pow_big>
82+ <TagMontyHalfrange , Tag_montgomery_two_pow_clang, Tag_montgomery_two_pow_big>
4583{
4684 template <class MF , typename U> HURCHALLA_FORCE_INLINE
4785 static typename MF::MontgomeryValue call (const MF& mf, U n)
4886 {
49- return impl_montgomery_two_pow::call<true , 0 , 3 , MF, U >(mf, n);
87+ return impl_montgomery_two_pow::call<MF, U, false , 0 , 22 , false >(mf, n);
5088 }
5189 template <class MF , typename U, std::size_t ARRAY_SIZE> HURCHALLA_FORCE_INLINE
5290 static std::array<typename MF::MontgomeryValue, ARRAY_SIZE>
5391 call (const std::array<MF, ARRAY_SIZE>& mf, const std::array<U, ARRAY_SIZE>& n)
5492 {
55- return impl_montgomery_two_pow::call<0 , 0 , MF, U, ARRAY_SIZE >(mf, n);
93+ return impl_montgomery_two_pow::call<MF, U, ARRAY_SIZE, 0 , 31 , false >(mf, n);
5694 }
5795};
58- // Partial specialization : clang and big uint pow.
59- template <class MontyTag > struct tagged_montgomery_two_pow
60- <MontyTag , Tag_montgomery_two_pow_clang, Tag_montgomery_two_pow_big>
96+ // Full Specialization: clang and big uint pow and MontgomeryQuarter.
97+ template <> struct tagged_montgomery_two_pow
98+ <TagMontyQuarterrange , Tag_montgomery_two_pow_clang, Tag_montgomery_two_pow_big>
6199{
62100 template <class MF , typename U> HURCHALLA_FORCE_INLINE
63101 static typename MF::MontgomeryValue call (const MF& mf, U n)
64102 {
65- return impl_montgomery_two_pow::call<true , 0 , 3 , MF, U >(mf, n);
103+ return impl_montgomery_two_pow::call<MF, U, false , 0 , 22 , false >(mf, n);
66104 }
67105 template <class MF , typename U, std::size_t ARRAY_SIZE> HURCHALLA_FORCE_INLINE
68106 static std::array<typename MF::MontgomeryValue, ARRAY_SIZE>
69107 call (const std::array<MF, ARRAY_SIZE>& mf, const std::array<U, ARRAY_SIZE>& n)
70108 {
71- return impl_montgomery_two_pow::call<0 , 2 , MF, U, ARRAY_SIZE >(mf, n);
109+ return impl_montgomery_two_pow::call<MF, U, ARRAY_SIZE, 0 , 30 , false >(mf, n);
72110 }
73111};
74- // Partial specialization: clang and small uint pow.
112+
113+
114+ // Partial specialization: gcc and big uint pow.
115+ // Intended for MontgomeryFull, but catches all non-specialized monty types
75116template <class MontyTag > struct tagged_montgomery_two_pow
76- <MontyTag, Tag_montgomery_two_pow_clang, Tag_montgomery_two_pow_small>
117+ <MontyTag, Tag_montgomery_two_pow_gcc, Tag_montgomery_two_pow_big>
118+ {
119+ template <class MF , typename U> HURCHALLA_FORCE_INLINE
120+ static typename MF::MontgomeryValue call (const MF& mf, U n)
121+ {
122+ return impl_montgomery_two_pow::call<MF, U, false , 0 , 33 , true >(mf, n);
123+ }
124+ template <class MF , typename U, std::size_t ARRAY_SIZE> HURCHALLA_FORCE_INLINE
125+ static std::array<typename MF::MontgomeryValue, ARRAY_SIZE>
126+ call (const std::array<MF, ARRAY_SIZE>& mf, const std::array<U, ARRAY_SIZE>& n)
127+ {
128+ return impl_montgomery_two_pow::call<MF, U, ARRAY_SIZE, 0 , 30 , false >(mf, n);
129+ }
130+ };
131+ // Full Specialization: gcc and big uint pow and MontgomeryHalf.
132+ template <> struct tagged_montgomery_two_pow
133+ <TagMontyHalfrange, Tag_montgomery_two_pow_gcc, Tag_montgomery_two_pow_big>
134+ {
135+ template <class MF , typename U> HURCHALLA_FORCE_INLINE
136+ static typename MF::MontgomeryValue call (const MF& mf, U n)
137+ {
138+ return impl_montgomery_two_pow::call<MF, U, false , 0 , 33 , false >(mf, n);
139+ }
140+ template <class MF , typename U, std::size_t ARRAY_SIZE> HURCHALLA_FORCE_INLINE
141+ static std::array<typename MF::MontgomeryValue, ARRAY_SIZE>
142+ call (const std::array<MF, ARRAY_SIZE>& mf, const std::array<U, ARRAY_SIZE>& n)
143+ {
144+ return impl_montgomery_two_pow::call<MF, U, ARRAY_SIZE, 0 , 31 , false >(mf, n);
145+ }
146+ };
147+ // Full Specialization: gcc and big uint pow and MontgomeryQuarter.
148+ template <> struct tagged_montgomery_two_pow
149+ <TagMontyQuarterrange, Tag_montgomery_two_pow_gcc, Tag_montgomery_two_pow_big>
77150{
78151 template <class MF , typename U> HURCHALLA_FORCE_INLINE
79152 static typename MF::MontgomeryValue call (const MF& mf, U n)
80153 {
81- return impl_montgomery_two_pow::call<true , 0 , 1 , MF, U >(mf, n);
154+ return impl_montgomery_two_pow::call<MF, U, false , 0 , 22 , false >(mf, n);
82155 }
83156 template <class MF , typename U, std::size_t ARRAY_SIZE> HURCHALLA_FORCE_INLINE
84157 static std::array<typename MF::MontgomeryValue, ARRAY_SIZE>
85158 call (const std::array<MF, ARRAY_SIZE>& mf, const std::array<U, ARRAY_SIZE>& n)
86159 {
87- return impl_montgomery_two_pow::call<0 , 0 , MF, U, ARRAY_SIZE >(mf, n);
160+ return impl_montgomery_two_pow::call<MF, U, ARRAY_SIZE, 0 , 31 , false >(mf, n);
88161 }
89162};
90163
91164
92- // Full specialization: gcc and big uint pow and MontgomeryQuarter.
165+
166+ // Partial Specialization: clang and small uint pow.
167+ // Intended for MontgomeryFull, but catches all non-specialized monty types
168+ template <class MontyTag > struct tagged_montgomery_two_pow
169+ <MontyTag, Tag_montgomery_two_pow_clang, Tag_montgomery_two_pow_small>
170+ {
171+ template <class MF , typename U> HURCHALLA_FORCE_INLINE
172+ static typename MF::MontgomeryValue call (const MF& mf, U n)
173+ {
174+ return impl_montgomery_two_pow::call<MF, U, false , 0 , 33 , true >(mf, n);
175+ }
176+ template <class MF , typename U, std::size_t ARRAY_SIZE> HURCHALLA_FORCE_INLINE
177+ static std::array<typename MF::MontgomeryValue, ARRAY_SIZE>
178+ call (const std::array<MF, ARRAY_SIZE>& mf, const std::array<U, ARRAY_SIZE>& n)
179+ {
180+ return impl_montgomery_two_pow::call<MF, U, ARRAY_SIZE, 0 , 31 , false >(mf, n);
181+ }
182+ };
183+ // Full Specialization: clang and small uint pow and MontgomeryHalf.
93184template <> struct tagged_montgomery_two_pow
94- <TagMontyQuarterrange, Tag_montgomery_two_pow_gcc, Tag_montgomery_two_pow_big >
185+ <TagMontyHalfrange, Tag_montgomery_two_pow_clang, Tag_montgomery_two_pow_small >
95186{
96187 template <class MF , typename U> HURCHALLA_FORCE_INLINE
97188 static typename MF::MontgomeryValue call (const MF& mf, U n)
98189 {
99- return impl_montgomery_two_pow::call<true , 0 , 1 , MF, U >(mf, n);
190+ return impl_montgomery_two_pow::call<MF, U, false , 0 , 24 , false >(mf, n);
100191 }
101192 template <class MF , typename U, std::size_t ARRAY_SIZE> HURCHALLA_FORCE_INLINE
102193 static std::array<typename MF::MontgomeryValue, ARRAY_SIZE>
103194 call (const std::array<MF, ARRAY_SIZE>& mf, const std::array<U, ARRAY_SIZE>& n)
104195 {
105- return impl_montgomery_two_pow::call<0 , 0 , MF, U, ARRAY_SIZE >(mf, n);
196+ return impl_montgomery_two_pow::call<MF, U, ARRAY_SIZE, 0 , 29 , false >(mf, n);
106197 }
107198};
108- // Partial specialization: gcc and big uint pow.
109- template <class MontyTag > struct tagged_montgomery_two_pow
110- <MontyTag, Tag_montgomery_two_pow_gcc, Tag_montgomery_two_pow_big >
199+ // Full Specialization: clang and small uint pow and MontgomeryQuarter.
200+ template <> struct tagged_montgomery_two_pow
201+ <TagMontyQuarterrange, Tag_montgomery_two_pow_clang, Tag_montgomery_two_pow_small >
111202{
112203 template <class MF , typename U> HURCHALLA_FORCE_INLINE
113204 static typename MF::MontgomeryValue call (const MF& mf, U n)
114205 {
115- return impl_montgomery_two_pow::call<false , 0 , 2 , MF, U >(mf, n);
206+ return impl_montgomery_two_pow::call<MF, U, false , 0 , 24 , false >(mf, n);
116207 }
117208 template <class MF , typename U, std::size_t ARRAY_SIZE> HURCHALLA_FORCE_INLINE
118209 static std::array<typename MF::MontgomeryValue, ARRAY_SIZE>
119210 call (const std::array<MF, ARRAY_SIZE>& mf, const std::array<U, ARRAY_SIZE>& n)
120211 {
121- return impl_montgomery_two_pow::call<0 , 0 , MF, U, ARRAY_SIZE >(mf, n);
212+ return impl_montgomery_two_pow::call<MF, U, ARRAY_SIZE, 0 , 31 , false >(mf, n);
122213 }
123214};
124- // Partial specialization: gcc and small uint pow.
215+
216+
217+ // Partial Specialization: gcc and small uint pow.
218+ // Intended for MontgomeryFull, but catches all non-specialized monty types
125219template <class MontyTag > struct tagged_montgomery_two_pow
126- <MontyTag, Tag_montgomery_two_pow_gcc, Tag_montgomery_two_pow_small>
220+ <MontyTag, Tag_montgomery_two_pow_gcc, Tag_montgomery_two_pow_small>
127221{
128222 template <class MF , typename U> HURCHALLA_FORCE_INLINE
129223 static typename MF::MontgomeryValue call (const MF& mf, U n)
130224 {
131- return impl_montgomery_two_pow::call<true , 0 , 3 , MF, U >(mf, n);
225+ return impl_montgomery_two_pow::call<MF, U, false , 0 , 23 , true >(mf, n);
132226 }
133227 template <class MF , typename U, std::size_t ARRAY_SIZE> HURCHALLA_FORCE_INLINE
134228 static std::array<typename MF::MontgomeryValue, ARRAY_SIZE>
135229 call (const std::array<MF, ARRAY_SIZE>& mf, const std::array<U, ARRAY_SIZE>& n)
136230 {
137- return impl_montgomery_two_pow::call<0 , 0 , MF, U, ARRAY_SIZE >(mf, n);
231+ return impl_montgomery_two_pow::call<MF, U, ARRAY_SIZE, 0 , 31 , false >(mf, n);
138232 }
139233};
234+ // Full Specialization: gcc and small uint pow and MontgomeryHalf.
235+ template <> struct tagged_montgomery_two_pow
236+ <TagMontyHalfrange, Tag_montgomery_two_pow_gcc, Tag_montgomery_two_pow_small>
237+ {
238+ template <class MF , typename U> HURCHALLA_FORCE_INLINE
239+ static typename MF::MontgomeryValue call (const MF& mf, U n)
240+ {
241+ return impl_montgomery_two_pow::call<MF, U, false , 0 , 24 , false >(mf, n);
242+ }
243+ template <class MF , typename U, std::size_t ARRAY_SIZE> HURCHALLA_FORCE_INLINE
244+ static std::array<typename MF::MontgomeryValue, ARRAY_SIZE>
245+ call (const std::array<MF, ARRAY_SIZE>& mf, const std::array<U, ARRAY_SIZE>& n)
246+ {
247+ return impl_montgomery_two_pow::call<MF, U, ARRAY_SIZE, 0 , 28 , false >(mf, n);
248+ }
249+ };
250+ // Full Specialization: gcc and small uint pow and MontgomeryQuarter.
251+ template <> struct tagged_montgomery_two_pow
252+ <TagMontyQuarterrange, Tag_montgomery_two_pow_gcc, Tag_montgomery_two_pow_small>
253+ {
254+ template <class MF , typename U> HURCHALLA_FORCE_INLINE
255+ static typename MF::MontgomeryValue call (const MF& mf, U n)
256+ {
257+ return impl_montgomery_two_pow::call<MF, U, false , 0 , 24 , false >(mf, n);
258+ }
259+ template <class MF , typename U, std::size_t ARRAY_SIZE> HURCHALLA_FORCE_INLINE
260+ static std::array<typename MF::MontgomeryValue, ARRAY_SIZE>
261+ call (const std::array<MF, ARRAY_SIZE>& mf, const std::array<U, ARRAY_SIZE>& n)
262+ {
263+ return impl_montgomery_two_pow::call<MF, U, ARRAY_SIZE, 0 , 29 , false >(mf, n);
264+ }
265+ };
266+
140267
141268
142269
@@ -145,6 +272,7 @@ struct montgomery_two_pow {
145272 // Calculate pow(2, n), modulo the modulus of mf, and return the result in
146273 // montgomeryform representation.
147274 template <class MF , typename T>
275+ HURCHALLA_FORCE_INLINE
148276 static typename MF::MontgomeryValue call (const MF& mf, T nt)
149277 {
150278 HPBC_CLOCKWORK_PRECONDITION (nt >= 0 );
@@ -169,8 +297,8 @@ struct montgomery_two_pow {
169297
170298 // Helper function - delegated Array version of montgomery two pow
171299 template <class MF , typename U, std::size_t ARRAY_SIZE>
172- static std::array<typename MF::MontgomeryValue, ARRAY_SIZE>
173300 HURCHALLA_FORCE_INLINE
301+ static std::array<typename MF::MontgomeryValue, ARRAY_SIZE>
174302 helper (const std::array<MF,ARRAY_SIZE>& mf, const std::array<U,ARRAY_SIZE>& n)
175303 {
176304 static_assert (hurchalla::ut_numeric_limits<U>::is_integer, " " );
@@ -194,18 +322,18 @@ struct montgomery_two_pow {
194322
195323 // Array version of montgomery two pow, for unsigned T
196324 template <class MF , typename T, std::size_t ARRAY_SIZE>
197- static
325+ HURCHALLA_FORCE_INLINE static
198326 typename std::enable_if<!(hurchalla::ut_numeric_limits<T>::is_signed),
199327 std::array<typename MF::MontgomeryValue, ARRAY_SIZE>
200328 >::type
201329 call (const std::array<MF,ARRAY_SIZE>& mf, const std::array<T,ARRAY_SIZE>& nt)
202330 {
203- return helper<MF, T, ARRAY_SIZE> (mf, nt);
331+ return helper (mf, nt);
204332 }
205333
206334 // Array version of montgomery two pow, for signed T
207335 template <class MF , typename T, std::size_t ARRAY_SIZE>
208- static
336+ HURCHALLA_FORCE_INLINE static
209337 typename std::enable_if<(hurchalla::ut_numeric_limits<T>::is_signed),
210338 std::array<typename MF::MontgomeryValue, ARRAY_SIZE>
211339 >::type
@@ -217,7 +345,7 @@ struct montgomery_two_pow {
217345 HPBC_CLOCKWORK_PRECONDITION (nt[i] >= 0 );
218346 n[i] = static_cast <U>(nt[i]);
219347 }
220- return helper<MF, U, ARRAY_SIZE> (mf, n);
348+ return helper (mf, n);
221349 }
222350};
223351
0 commit comments