Skip to content

Commit 84f8feb

Browse files
committed
finish adding 128bit asm for x86
1 parent c7f7c7b commit 84f8feb

File tree

2 files changed

+123
-7
lines changed

2 files changed

+123
-7
lines changed

montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/detail/platform_specific/two_times_restricted.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ struct two_times_restricted_unsigned {
108108
defined(HURCHALLA_ALLOW_INLINE_ASM_MODADD)) && \
109109
defined(HURCHALLA_TARGET_ISA_X86_64) && !defined(_MSC_VER)
110110

111-
#ifdef HURCHALLA_ENABLE_INLINE_ASM_128_BIT
111+
# if (HURCHALLA_COMPILER_HAS_UINT128_T())
112112
template <>
113113
struct two_times_restricted_unsigned<__uint128_t> {
114114
HURCHALLA_FORCE_INLINE static
@@ -133,11 +133,11 @@ struct two_times_restricted_unsigned<__uint128_t> {
133133
"cmovbq %[tmplo], %[sumlo] \n\t" /* sum = (sum<m) ? tmp : tmp2 */
134134
"cmovbq %[tmphi], %[sumhi] \n\t"
135135
: [sumlo]"+&r"(sumlo), [sumhi]"+&r"(sumhi)
136-
# if defined(__clang__) /* https://bugs.llvm.org/show_bug.cgi?id=20197 */
136+
# if defined(__clang__) /* https://bugs.llvm.org/show_bug.cgi?id=20197 */
137137
: [mlo]"r"(mlo), [mhi]"r"(mhi), [tmplo]"r"(tmplo), [tmphi]"r"(tmphi)
138-
# else
138+
# else
139139
: [mlo]"rm"(mlo), [mhi]"rm"(mhi), [tmplo]"rm"(tmplo), [tmphi]"rm"(tmphi)
140-
# endif
140+
# endif
141141
: "cc");
142142
__uint128_t result = (static_cast<__uint128_t>(sumhi) << 64) | sumlo;
143143

@@ -147,7 +147,7 @@ struct two_times_restricted_unsigned<__uint128_t> {
147147
return result;
148148
}
149149
};
150-
#endif
150+
# endif
151151

152152
template <>
153153
struct two_times_restricted_unsigned<std::uint64_t> {
@@ -227,7 +227,7 @@ struct two_times_restricted_unsigned<std::uint32_t> {
227227
#if (defined(HURCHALLA_ALLOW_INLINE_ASM_ALL) || \
228228
defined(HURCHALLA_ALLOW_INLINE_ASM_ABSDIFF)) && \
229229
defined(HURCHALLA_TARGET_ISA_ARM_64) && !defined(_MSC_VER)
230-
# if defined(HURCHALLA_ENABLE_INLINE_ASM_128_BIT) && (HURCHALLA_COMPILER_HAS_UINT128_T())
230+
# if (HURCHALLA_COMPILER_HAS_UINT128_T())
231231
*/
232232
template <>
233233
struct two_times_restricted_unsigned<__uint128_t> {

montgomery_arithmetic/include/hurchalla/montgomery_arithmetic/low_level_api/detail/platform_specific/ImplRedc.h

Lines changed: 117 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,17 +192,28 @@ struct DefaultRedcStandard
192192
static HURCHALLA_FORCE_INLINE
193193
T call(T u_hi, T u_lo, T n, T inv_n)
194194
{
195+
#if defined(HURCHALLA_AVOID_CSELECT)
195196
bool ovf;
196197
T result = RedcIncomplete::call(ovf, u_hi, u_lo, n, inv_n);
197198
// By RedcIncomplete::call()'s Postcondition #1, we get
198199
// T final_result = (ovf) ? static_cast<T>(result + n) : result;
199-
#if defined(HURCHALLA_AVOID_CSELECT)
200200
T mask = static_cast<T>(-static_cast<T>(ovf));
201201
T final_result = static_cast<T>(result + (mask & n));
202202
#else
203+
# if 0
204+
bool ovf;
205+
T result = RedcIncomplete::call(ovf, u_hi, u_lo, n, inv_n);
203206
// final_result = (ovf) ? static_cast<T>(result+n) : result
204207
T final_result = ::hurchalla::conditional_select(
205208
ovf, static_cast<T>(result+n), result);
209+
# else
210+
using P = typename safely_promote_unsigned<T>::type;
211+
T m = static_cast<T>(static_cast<P>(u_lo) * static_cast<P>(inv_n));
212+
T mn_lo;
213+
T mn_hi = ::hurchalla::unsigned_multiply_to_hilo_product(mn_lo, m, n);
214+
T final_result = ::hurchalla::modular_subtraction_prereduced_inputs
215+
<T,::hurchalla::LowuopsTag>(u_hi, mn_hi, n);
216+
# endif
206217
#endif
207218
HPBC_POSTCONDITION2(final_result < n);
208219
return final_result;
@@ -232,6 +243,65 @@ struct RedcStandard
232243
#if (defined(HURCHALLA_ALLOW_INLINE_ASM_ALL) || \
233244
defined(HURCHALLA_ALLOW_INLINE_ASM_REDC)) && \
234245
defined(HURCHALLA_TARGET_ISA_X86_64) && !defined(_MSC_VER)
246+
247+
# if (HURCHALLA_COMPILER_HAS_UINT128_T())
248+
// specialization for __uint128_t (for x86_64)
249+
// Full specialization of RedcStandard for T = __uint128_t, compiled only when
// the enclosing preprocessor conditions (inline-asm REDC allowed, x86_64
// target, not MSVC, compiler has __uint128_t) hold.
// It provides two call() overloads selected by a tag argument:
//  - LowlatencyTag: hand-written x86_64 inline asm for the final
//    subtract-and-conditionally-correct step.
//  - LowuopsTag: forwards to DefaultRedcStandard<T>::call().
// Both overloads check the postcondition result < n.
template <>
250+
struct RedcStandard<__uint128_t>
251+
{
252+
using T = __uint128_t;
253+
254+
// Inline-asm variant.
// Checked preconditions: u_hi < n, n is odd, n > 1, and n * inv_n == 1 when
// truncated to T.
// Returns a value that is asserted to equal DefaultRedcStandard<T>::call()
// for the same arguments, and that satisfies result < n.
static HURCHALLA_FORCE_INLINE
255+
T call(T u_hi, T u_lo, T n, T inv_n, LowlatencyTag)
256+
{
257+
using P = typename safely_promote_unsigned<T>::type;
258+
// see uint64_t version's comments for explanations
259+
HPBC_PRECONDITION2(u_hi < n);
260+
HPBC_PRECONDITION2(
261+
static_cast<T>(static_cast<P>(n) * static_cast<P>(inv_n)) == 1);
262+
HPBC_PRECONDITION2(n % 2 == 1);
263+
HPBC_PRECONDITION2(n > 1);
264+
265+
// m is the product u_lo * inv_n truncated to T.
T m = static_cast<T>(static_cast<P>(u_lo) * static_cast<P>(inv_n));
266+
// mn_lo is written by the multiply but is not read again in this function;
// only the returned mn_hi is used below.
// NOTE(review): presumably mn_hi/mn_lo are the high/low halves of m * n --
// confirm against unsigned_multiply_to_hilo_product's documentation.
T mn_lo;
267+
T mn_hi = ::hurchalla::unsigned_multiply_to_hilo_product(mn_lo, m, n);
268+
HPBC_ASSERT2(mn_hi < n);
269+
// Unsigned addition, so this may wrap modulo 2^128. The wrapped value is
// only selected when u_hi < mn_hi, where u_hi + n - mn_hi lies in [0, n).
T reg = u_hi + n;
270+
271+
// Split each 128-bit value into 64-bit halves so the asm can operate on
// general purpose registers (low half first, then high half with borrow).
using std::uint64_t;
272+
uint64_t reglo = static_cast<uint64_t>(reg);
273+
uint64_t reghi = static_cast<uint64_t>(reg >> 64);
274+
uint64_t uhilo = static_cast<uint64_t>(u_hi);
275+
uint64_t uhihi = static_cast<uint64_t>(u_hi >> 64);
276+
uint64_t mnhilo = static_cast<uint64_t>(mn_hi);
277+
uint64_t mnhihi = static_cast<uint64_t>(mn_hi >> 64);
278+
// Instruction order matters: the second sub/sbb pair must come last so that
// the carry flag seen by both cmovae instructions is the borrow of the full
// 128-bit u_hi - mn_hi. cmov does not modify flags, so the two cmovae
// instructions make the same choice for the low and high halves.
// The four "+&r" operands are read-write registers marked early-clobber;
// "cc" declares that the flags register is modified.
__asm__ (
279+
"subq %[mnhilo], %[reglo] \n\t" /* reg = u_hi + n - mn_hi */
280+
"sbbq %[mnhihi], %[reghi] \n\t"
281+
"subq %[mnhilo], %[uhilo] \n\t" /* t_hi = u_hi - mn_hi */
282+
"sbbq %[mnhihi], %[uhihi] \n\t"
283+
"cmovaeq %[uhilo], %[reglo] \n\t" /* reg = (u_hi >= mn_hi) ? t_hi : reg */
284+
"cmovaeq %[uhihi], %[reghi] \n\t"
285+
: [reglo]"+&r"(reglo), [reghi]"+&r"(reghi), [uhilo]"+&r"(uhilo), [uhihi]"+&r"(uhihi)
286+
: [mnhilo]"r"(mnhilo), [mnhihi]"r"(mnhihi)
287+
: "cc");
288+
// Reassemble the 128-bit result from its two 64-bit halves.
T result = (static_cast<__uint128_t>(reghi) << 64) | reglo;
289+
// Cross-check the asm against the generic implementation; u_hi itself was
// not modified by the asm (only the uhilo/uhihi copies were).
HPBC_ASSERT2(result == DefaultRedcStandard<T>::call(u_hi, u_lo, n, inv_n));
290+
HPBC_POSTCONDITION2(result < n);
291+
return result;
292+
}
293+
294+
// LowuopsTag variant: no inline asm here; delegates to the generic
// DefaultRedcStandard<T>::call() and checks result < n.
static HURCHALLA_FORCE_INLINE
295+
T call(T u_hi, T u_lo, T n, T inv_n, LowuopsTag)
296+
{
297+
T result = DefaultRedcStandard<T>::call(u_hi, u_lo, n, inv_n);
298+
HPBC_POSTCONDITION2(result < n);
299+
return result;
300+
}
301+
};
302+
# endif
303+
304+
235305
// specialization for uint64_t (for x86_64)
236306
template <>
237307
struct RedcStandard<std::uint64_t>
@@ -286,6 +356,52 @@ struct RedcStandard<std::uint64_t>
286356
return result;
287357
}
288358
};
359+
360+
361+
// specialization for uint32_t
362+
// Full specialization of RedcStandard for T = std::uint32_t, compiled only
// when the enclosing preprocessor conditions (inline-asm REDC allowed, x86_64
// target, not MSVC) hold.
// It provides two call() overloads selected by a tag argument:
//  - LowlatencyTag: x86 inline asm for the final subtract-and-conditionally-
//    correct step.
//  - LowuopsTag: forwards to DefaultRedcStandard<T>::call().
// Both overloads check the postcondition result < n.
template <>
363+
struct RedcStandard<std::uint32_t>
364+
{
365+
using T = std::uint32_t;
366+
367+
// Inline-asm variant.
// Checked preconditions: u_hi < n, n is odd, n > 1, and n * inv_n == 1 when
// truncated to T.
// Returns a value that is asserted to equal DefaultRedcStandard<T>::call()
// for the same arguments, and that satisfies result < n.
static HURCHALLA_FORCE_INLINE
368+
T call(T u_hi, T u_lo, T n, T inv_n, LowlatencyTag)
369+
{
370+
using P = typename safely_promote_unsigned<T>::type;
371+
// see uint64_t version's comments for explanations
372+
HPBC_PRECONDITION2(u_hi < n);
373+
HPBC_PRECONDITION2(
374+
static_cast<T>(static_cast<P>(n) * static_cast<P>(inv_n)) == 1);
375+
HPBC_PRECONDITION2(n % 2 == 1);
376+
HPBC_PRECONDITION2(n > 1);
377+
378+
// m is the product u_lo * inv_n truncated to T.
T m = static_cast<T>(static_cast<P>(u_lo) * static_cast<P>(inv_n));
379+
// mn_lo is written by the multiply but is not read again in this function;
// only the returned mn_hi is used below.
// NOTE(review): presumably mn_hi/mn_lo are the high/low halves of m * n --
// confirm against unsigned_multiply_to_hilo_product's documentation.
T mn_lo;
380+
T mn_hi = ::hurchalla::unsigned_multiply_to_hilo_product(mn_lo, m, n);
381+
HPBC_ASSERT2(mn_hi < n);
382+
// Unsigned addition, so this may wrap modulo 2^32. The wrapped value is
// only selected when u_hi < mn_hi, where u_hi + n - mn_hi lies in [0, n).
T reg = u_hi + n;
383+
// Mutable copy of u_hi: the asm overwrites it with u_hi - mn_hi, while the
// u_hi parameter stays intact for the assertion after the asm.
T uhi = u_hi;
384+
// Instruction order matters: the second subl must come last so that the
// carry flag read by cmovael is the borrow of u_hi - mn_hi (carry clear
// means u_hi >= mn_hi, in which case t_hi replaces reg).
// Both "+&r" operands are read-write registers marked early-clobber;
// "cc" declares that the flags register is modified.
__asm__ (
385+
"subl %[mnhi], %[reg] \n\t" /* reg = u_hi + n - mn_hi */
386+
"subl %[mnhi], %[uhi] \n\t" /* t_hi = u_hi - mn_hi */
387+
"cmovael %[uhi], %[reg] \n\t" /* reg = (u_hi >= mn_hi) ? t_hi : reg */
388+
: [reg]"+&r"(reg), [uhi]"+&r"(uhi)
389+
: [mnhi]"r"(mn_hi)
390+
: "cc");
391+
T result = reg;
392+
// Cross-check the asm against the generic implementation.
HPBC_ASSERT2(result == DefaultRedcStandard<T>::call(u_hi, u_lo, n, inv_n));
393+
HPBC_POSTCONDITION2(result < n);
394+
return result;
395+
}
396+
397+
// LowuopsTag variant: no inline asm here; delegates to the generic
// DefaultRedcStandard<T>::call() and checks result < n.
static HURCHALLA_FORCE_INLINE
398+
T call(T u_hi, T u_lo, T n, T inv_n, LowuopsTag)
399+
{
400+
T result = DefaultRedcStandard<T>::call(u_hi, u_lo, n, inv_n);
401+
HPBC_POSTCONDITION2(result < n);
402+
return result;
403+
}
404+
};
289405
#endif // (defined(HURCHALLA_ALLOW_INLINE_ASM_ALL) ||
290406
// defined(HURCHALLA_ALLOW_INLINE_ASM_REDC)) &&
291407
// defined(HURCHALLA_TARGET_ISA_X86_64) && !defined(_MSC_VER)

0 commit comments

Comments
 (0)