Skip to content

Commit ab9fa6b

Browse files
sipa authored and fjahr committed
Add x86_64 assembly optimization for MuHash
1 parent f8bcc45 commit ab9fa6b

File tree

1 file changed

+50
-0
lines changed

1 file changed

+50
-0
lines changed

src/crypto/muhash.cpp

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,54 @@ namespace {
2828
c1 = 0; \
2929
}
3030

31+
#if defined(__amd64__) || defined(__x86_64__)
32+
33+
/** [c0,c1] = a * b
 *
 * Unsigned widening multiply done with a single `mulq`. `a` is placed in
 * rax; the instruction leaves the low half of the product in rax (bound to
 * c0) and the high half in rdx (bound to c1). The flags are declared
 * clobbered.
 *
 * `b` is constrained to "rm" (register or memory). `mulq` has no
 * immediate-operand encoding, so the broader "g" constraint, which also
 * permits an immediate, could make the compiler emit an operand the
 * assembler rejects when `b` is a compile-time constant.
 *
 * c0 and c1 must be 64-bit lvalues; a and b are expected to be 64-bit values.
 */
#define mul(c0,c1,a,b) { \
    __asm__ ("mulq %3" : "=d"(c1), "=a"(c0) : "a"(a), "rm"(b) : "cc"); \
}
37+
38+
/** [c0,c1,c2] += a * b
 *
 * Adds the 128-bit product a*b into the three-limb accumulator
 * [c0,c1,c2] (c0 is the least significant limb).
 *
 * Step 1: `mulq` computes the product; the low half lands in rax and the
 *         high half in rdx.
 * Step 2: an add/adc/adc chain adds the low half to c0, the high half plus
 *         carry to c1, and the remaining carry to c2. A carry out of c2 is
 *         not captured.
 *
 * `b` uses "rm" rather than "g" because `mulq` cannot take an immediate.
 * The scratch variables carry macro-specific names so that a caller
 * variable called `tl` or `th` passed as an argument is not captured by
 * the macro's own locals.
 */
#define muladd3(c0,c1,c2,a,b) { \
    uint64_t muladd3_tl, muladd3_th; \
    __asm__ ("mulq %3" : "=a"(muladd3_tl), "=d"(muladd3_th) : "a"(a), "rm"(b) : "cc"); \
    __asm__ ("addq %3,%0; adcq %4,%1; adcq $0,%2" : "+r"(c0), "+r"(c1), "+r"(c2) : "a"(muladd3_tl), "d"(muladd3_th) : "cc"); \
}
44+
45+
/** [c0,c1,c2] += 2 * a * b
 *
 * Computes the 128-bit product a*b once with `mulq` (low half in rax, high
 * half in rdx) and then adds that product into the three-limb accumulator
 * [c0,c1,c2] twice, each time with an add/adc/adc carry chain. Adding the
 * product twice, instead of doubling it first, avoids losing the top bit
 * of the 128-bit product. A carry out of c2 is not captured.
 *
 * `b` uses "rm" rather than "g" because `mulq` cannot take an immediate.
 * The scratch variables carry macro-specific names so that caller
 * variables called `tl` or `th` are not captured by the macro's locals.
 */
#define muldbladd3(c0,c1,c2,a,b) { \
    uint64_t muldbladd3_tl, muldbladd3_th; \
    __asm__ ("mulq %3" : "=a"(muldbladd3_tl), "=d"(muldbladd3_th) : "a"(a), "rm"(b) : "cc"); \
    __asm__ ("addq %3,%0; adcq %4,%1; adcq $0,%2" : "+r"(c0), "+r"(c1), "+r"(c2) : "a"(muldbladd3_tl), "d"(muldbladd3_th) : "cc"); \
    __asm__ ("addq %3,%0; adcq %4,%1; adcq $0,%2" : "+r"(c0), "+r"(c1), "+r"(c2) : "a"(muldbladd3_tl), "d"(muldbladd3_th) : "cc"); \
}
52+
53+
/* [c0,c1,c2] += n * [d0,d1,d2]. c0 is initially 0
 *
 * NOTE(review): the code below adds into c0, c1 and c2 without relying on
 * any of them being zero; confirm the stated precondition against the
 * portable (non-asm) variant of this macro.
 *
 * Sequence:
 *   1. d0*n via `mulq`; the low half is added to c0, the high half plus
 *      carry to c1, and the remaining carry to c2.
 *   2. d1*n via `mulq`; the low half is added to c1, the high half plus
 *      carry to c2.
 *   3. d2*n via the three-operand `imulq` (only the low 64 bits are kept),
 *      added to c2.
 * Anything that would overflow past c2 is discarded.
 *
 * `n` must be a compile-time integer constant: it is passed with the "i"
 * (immediate) constraint to `imulq`. It is also cast to
 * Num3072::limb_type for the `mulq` steps. The `imulq` source operand d2
 * uses "rm" because that instruction form accepts only a register or
 * memory source; "g" would additionally allow an immediate.
 *
 * The scratch variables carry macro-specific names so that caller
 * variables with short names such as `tl1` are not captured.
 */
#define mulnadd3(c0,c1,c2,d0,d1,d2,n) { \
    uint64_t mulnadd3_tl1, mulnadd3_th1, mulnadd3_tl2, mulnadd3_th2, mulnadd3_tl3; \
    __asm__ ("mulq %3" : "=a"(mulnadd3_tl1), "=d"(mulnadd3_th1) : "a"(d0), "r"((Num3072::limb_type)(n)) : "cc"); \
    __asm__ ("addq %3,%0; adcq %4,%1; adcq $0,%2" : "+r"(c0), "+r"(c1), "+r"(c2) : "g"(mulnadd3_tl1), "g"(mulnadd3_th1) : "cc"); \
    __asm__ ("mulq %3" : "=a"(mulnadd3_tl2), "=d"(mulnadd3_th2) : "a"(d1), "r"((Num3072::limb_type)(n)) : "cc"); \
    __asm__ ("addq %2,%0; adcq %3,%1" : "+r"(c1), "+r"(c2) : "g"(mulnadd3_tl2), "g"(mulnadd3_th2) : "cc"); \
    __asm__ ("imulq %2,%1,%0" : "=r"(mulnadd3_tl3) : "rm"(d2), "i"(n) : "cc"); \
    __asm__ ("addq %1,%0" : "+r"(c2) : "g"(mulnadd3_tl3) : "cc"); \
}
63+
64+
/* [c0,c1] *= n
 *
 * Multiplies the two-limb value [c0,c1] (c0 least significant) by n,
 * keeping only the low 128 bits of the result:
 *   1. `mulq` multiplies c0 (in rax) by n; the low half replaces c0 and the
 *      high half is kept in a scratch variable (rdx).
 *   2. `imulq` multiplies c1 by n in place, keeping the low 64 bits.
 *   3. The high half from step 1 is added to c1.
 *
 * `n` must be a compile-time integer constant: it is passed with the "i"
 * (immediate) constraint to `imulq`, and cast to Num3072::limb_type for
 * `mulq`. The explicit `q` suffix on `imulq` matches the 64-bit `mulq` and
 * `addq` applied to the same operands.
 *
 * The scratch variable has a macro-specific name so that a caller variable
 * called `th` passed as an argument is not captured.
 */
#define muln2(c0,c1,n) { \
    uint64_t muln2_th; \
    __asm__ ("mulq %2" : "+a"(c0), "=d"(muln2_th) : "r"((Num3072::limb_type)(n)) : "cc"); \
    __asm__ ("imulq %1,%0,%0" : "+r"(c1) : "i"(n) : "cc"); \
    __asm__ ("addq %1,%0" : "+r"(c1) : "g"(muln2_th) : "cc"); \
}
71+
72+
/** [c0,c1] += a
 *
 * Adds the single limb `a` to the two-limb value [c0,c1] (c0 least
 * significant): `addq` adds a to c0, then `adcq $0` folds the resulting
 * carry into c1. A carry out of c1 is not captured.
 *
 * The instructions carry an explicit `q` suffix, like the other macros in
 * this group, so the 64-bit operand size is stated rather than inferred
 * from the register operands.
 */
#define add2(c0,c1,a) { \
    __asm__ ("addq %2,%0; adcq $0,%1" : "+r"(c0), "+r"(c1) : "r"(a) : "cc"); \
}
76+
77+
#else
78+
3179
/** [c0,c1] = a * b */
3280
#define mul(c0,c1,a,b) { \
3381
Num3072::double_limb_type t = (Num3072::double_limb_type)a * b; \
@@ -95,6 +143,8 @@ namespace {
95143
c1 += (c0 < (a)) ? 1 : 0; \
96144
}
97145

146+
#endif
147+
98148
bool IsOverflow(const Num3072* d)
99149
{
100150
for (int i = 1; i < Num3072::LIMBS - 1; ++i) {

0 commit comments

Comments
 (0)