Skip to content

Commit f974f03

Browse files
Raimo33 (Claudio Raimondi)
authored and committed
Add simd to scalar_4x64, Add TODOs
1 parent 58316e9 commit f974f03

File tree

5 files changed

+114
-5
lines changed

5 files changed

+114
-5
lines changed

src/field_5x52_impl.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,17 +7,17 @@
77
#ifndef SECP256K1_FIELD_REPR_IMPL_H
88
#define SECP256K1_FIELD_REPR_IMPL_H
99

10-
#ifdef X86
11-
# include <immintrin.h>
12-
#endif
13-
1410
#include "checkmem.h"
1511
#include "util.h"
1612
#include "field.h"
1713
#include "modinv64_impl.h"
1814

1915
#include "field_5x52_int128_impl.h"
2016

17+
#ifdef X86
18+
# include <immintrin.h>
19+
#endif
20+
2121
#ifdef VERIFY
2222
static void secp256k1_fe_impl_verify(const secp256k1_fe *a) {
2323
const uint64_t *d = a->n;

src/hash_impl.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@
1414
#include <stdint.h>
1515
#include <string.h>
1616

17+
#ifdef X86
18+
# include <immintrin.h>
19+
#endif
20+
1721
#define Ch(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
1822
#define Maj(x,y,z) (((x) & (y)) | ((z) & ((x) | (y))))
1923
#define Sigma0(x) (((x) >> 2 | (x) << 30) ^ ((x) >> 13 | (x) << 19) ^ ((x) >> 22 | (x) << 10))

src/modinv32_impl.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ static void secp256k1_modinv32_mul_30(secp256k1_modinv32_signed30 *r, const secp
4040
/* Return -1 for a<b*factor, 0 for a==b*factor, 1 for a>b*factor. A consists of alen limbs; b has 9. */
4141
static int secp256k1_modinv32_mul_cmp_30(const secp256k1_modinv32_signed30 *a, int alen, const secp256k1_modinv32_signed30 *b, int32_t factor) {
4242
int i;
43+
int diff;
4344
secp256k1_modinv32_signed30 am, bm;
4445
secp256k1_modinv32_mul_30(&am, a, alen, 1); /* Normalize all but the top limb of a. */
4546
secp256k1_modinv32_mul_30(&bm, b, 9, factor);

src/scalar_4x64_impl.h

Lines changed: 79 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@
1212
#include "modinv64_impl.h"
1313
#include "util.h"
1414

15+
#ifdef X86
16+
# include <immintrin.h>
17+
#endif
18+
1519
/* Limbs of the secp256k1 order. */
1620
#define SECP256K1_N_0 ((uint64_t)0xBFD25E8CD0364141ULL)
1721
#define SECP256K1_N_1 ((uint64_t)0xBAAEDCE6AF48A03BULL)
@@ -143,6 +147,7 @@ static void secp256k1_scalar_cadd_bit(secp256k1_scalar *r, unsigned int bit, int
143147

144148
static void secp256k1_scalar_set_b32(secp256k1_scalar *r, const unsigned char *b32, int *overflow) {
145149
int over;
150+
146151
r->d[0] = secp256k1_read_be64(&b32[24]);
147152
r->d[1] = secp256k1_read_be64(&b32[16]);
148153
r->d[2] = secp256k1_read_be64(&b32[8]);
@@ -866,14 +871,27 @@ static void secp256k1_scalar_mul(secp256k1_scalar *r, const secp256k1_scalar *a,
866871
static void secp256k1_scalar_split_128(secp256k1_scalar *r1, secp256k1_scalar *r2, const secp256k1_scalar *k) {
867872
SECP256K1_SCALAR_VERIFY(k);
868873

874+
#ifdef __AVX2__
875+
{
876+
__m128i k_01 = _mm_loadu_si128((__m128i *)k->d);
877+
__m128i k_23 = _mm_loadu_si128((__m128i *)(k->d + 2));
878+
const __m128i zeros = _mm_setzero_si128(); /* TODO: precompute */
879+
_mm_storeu_si128((__m128i*)(r1->d + 2), zeros);
880+
_mm_storeu_si128((__m128i*)(r2->d + 2), zeros);
881+
_mm_storeu_si128((__m128i*)r1->d, k_01);
882+
_mm_storeu_si128((__m128i*)r2->d, k_23);
883+
}
884+
#else
869885
r1->d[0] = k->d[0];
870886
r1->d[1] = k->d[1];
871887
r1->d[2] = 0;
872888
r1->d[3] = 0;
889+
873890
r2->d[0] = k->d[2];
874891
r2->d[1] = k->d[3];
875892
r2->d[2] = 0;
876893
r2->d[3] = 0;
894+
#endif
877895

878896
SECP256K1_SCALAR_VERIFY(r1);
879897
SECP256K1_SCALAR_VERIFY(r2);
@@ -883,7 +901,19 @@ SECP256K1_INLINE static int secp256k1_scalar_eq(const secp256k1_scalar *a, const
883901
SECP256K1_SCALAR_VERIFY(a);
884902
SECP256K1_SCALAR_VERIFY(b);
885903

886-
return ((a->d[0] ^ b->d[0]) | (a->d[1] ^ b->d[1]) | (a->d[2] ^ b->d[2]) | (a->d[3] ^ b->d[3])) == 0;
904+
#ifdef __AVX2__
905+
{
906+
__m256i vec_a = _mm256_loadu_si256((__m256i *)a->d);
907+
__m256i vec_b = _mm256_loadu_si256((__m256i *)b->d);
908+
__m256i vec_xor = _mm256_xor_si256(vec_a, vec_b);
909+
return _mm256_testz_si256(vec_xor, vec_xor);
910+
}
911+
#else
912+
return ( (a->d[0] ^ b->d[0]) |
913+
(a->d[1] ^ b->d[1]) |
914+
(a->d[2] ^ b->d[2]) |
915+
(a->d[3] ^ b->d[3]) ) == 0;
916+
#endif
887917
}
888918

889919
SECP256K1_INLINE static void secp256k1_scalar_mul_shift_var(secp256k1_scalar *r, const secp256k1_scalar *a, const secp256k1_scalar *b, unsigned int shift) {
@@ -899,6 +929,9 @@ SECP256K1_INLINE static void secp256k1_scalar_mul_shift_var(secp256k1_scalar *r,
899929
shiftlimbs = shift >> 6;
900930
shiftlow = shift & 0x3F;
901931
shifthigh = 64 - shiftlow;
932+
933+
/* TODO: parallel? */
934+
902935
r->d[0] = shift < 512 ? (l[0 + shiftlimbs] >> shiftlow | (shift < 448 && shiftlow ? (l[1 + shiftlimbs] << shifthigh) : 0)) : 0;
903936
r->d[1] = shift < 448 ? (l[1 + shiftlimbs] >> shiftlow | (shift < 384 && shiftlow ? (l[2 + shiftlimbs] << shifthigh) : 0)) : 0;
904937
r->d[2] = shift < 384 ? (l[2 + shiftlimbs] >> shiftlow | (shift < 320 && shiftlow ? (l[3 + shiftlimbs] << shifthigh) : 0)) : 0;
@@ -909,17 +942,34 @@ SECP256K1_INLINE static void secp256k1_scalar_mul_shift_var(secp256k1_scalar *r,
909942
}
910943

911944
static SECP256K1_INLINE void secp256k1_scalar_cmov(secp256k1_scalar *r, const secp256k1_scalar *a, int flag) {
945+
#ifdef __AVX2__
946+
/* load here to mitigate load latency */
947+
__m256i vec_r = _mm256_loadu_si256((__m256i *)(r->d));
948+
__m256i vec_a = _mm256_loadu_si256((__m256i *)(a->d));
949+
#endif
912950
uint64_t mask0, mask1;
913951
volatile int vflag = flag;
914952
SECP256K1_SCALAR_VERIFY(a);
915953
SECP256K1_CHECKMEM_CHECK_VERIFY(r->d, sizeof(r->d));
916954

917955
mask0 = vflag + ~((uint64_t)0);
918956
mask1 = ~mask0;
957+
958+
#ifdef __AVX2__
959+
{
960+
const __m256i vec_mask0 = _mm256_set1_epi64x(mask0); /* TODO: precompute*/
961+
const __m256i vec_mask1 = _mm256_set1_epi64x(mask1); /* TODO: precompute*/
962+
vec_r = _mm256_and_si256(vec_r, vec_mask0);
963+
vec_a = _mm256_and_si256(vec_a, vec_mask1);
964+
vec_r = _mm256_or_si256(vec_r, vec_a);
965+
_mm256_storeu_si256((__m256i *)(r->d), vec_r);
966+
}
967+
#else
919968
r->d[0] = (r->d[0] & mask0) | (a->d[0] & mask1);
920969
r->d[1] = (r->d[1] & mask0) | (a->d[1] & mask1);
921970
r->d[2] = (r->d[2] & mask0) | (a->d[2] & mask1);
922971
r->d[3] = (r->d[3] & mask0) | (a->d[3] & mask1);
972+
#endif
923973

924974
SECP256K1_SCALAR_VERIFY(r);
925975
}
@@ -936,10 +986,23 @@ static void secp256k1_scalar_from_signed62(secp256k1_scalar *r, const secp256k1_
936986
VERIFY_CHECK(a3 >> 62 == 0);
937987
VERIFY_CHECK(a4 >> 8 == 0);
938988

989+
#ifdef __AVX2__
990+
{
991+
__m256i limbs_0123 = _mm256_setr_epi64x(a0, a1, a2, a3);
992+
__m256i limbs_1234 = _mm256_setr_epi64x(a1, a2, a3, a4);
993+
const __m256i shift_lhs = _mm256_setr_epi64x(0, 2, 4, 6);
994+
const __m256i shift_rhs = _mm256_setr_epi64x(62, 60, 58, 56);
995+
__m256i lhs = _mm256_srlv_epi64(limbs_0123, shift_lhs);
996+
__m256i rhs = _mm256_sllv_epi64(limbs_1234, shift_rhs);
997+
__m256i out = _mm256_or_si256(lhs, rhs);
998+
_mm256_storeu_si256((__m256i *)(r->d), out);
999+
}
1000+
#else
9391001
r->d[0] = a0 | a1 << 62;
9401002
r->d[1] = a1 >> 2 | a2 << 60;
9411003
r->d[2] = a2 >> 4 | a3 << 58;
9421004
r->d[3] = a3 >> 6 | a4 << 56;
1005+
#endif
9431006

9441007
SECP256K1_SCALAR_VERIFY(r);
9451008
}
@@ -949,10 +1012,25 @@ static void secp256k1_scalar_to_signed62(secp256k1_modinv64_signed62 *r, const s
9491012
const uint64_t a0 = a->d[0], a1 = a->d[1], a2 = a->d[2], a3 = a->d[3];
9501013
SECP256K1_SCALAR_VERIFY(a);
9511014

1015+
#ifdef __AVX2__
1016+
{
1017+
__m256i limbs_0012 = _mm256_setr_epi64x(a0, a0, a1, a2);
1018+
__m256i limbs_0123 = _mm256_setr_epi64x(a0, a1, a2, a3);
1019+
const __m256i shift_lhs = _mm256_setr_epi64x(0, 62, 60, 58); /*TODO: precompute */
1020+
const __m256i shift_rhs = _mm256_setr_epi64x(64, 2, 4, 6); /*TODO: precompute */
1021+
const __m256i mask62 = _mm256_set1_epi64x(M62); /*TODO: precompute */
1022+
__m256i lhs = _mm256_srlv_epi64(limbs_0012, shift_lhs);
1023+
__m256i rhs = _mm256_sllv_epi64(limbs_0123, shift_rhs);
1024+
__m256i out = _mm256_or_si256(lhs, rhs);
1025+
out = _mm256_and_si256(out, mask62);
1026+
_mm256_storeu_si256((__m256i *)r->v, out);
1027+
}
1028+
#else
9521029
r->v[0] = a0 & M62;
9531030
r->v[1] = (a0 >> 62 | a1 << 2) & M62;
9541031
r->v[2] = (a1 >> 60 | a2 << 4) & M62;
9551032
r->v[3] = (a2 >> 58 | a3 << 6) & M62;
1033+
#endif
9561034
r->v[4] = a3 >> 56;
9571035
}
9581036

src/scalar_8x32_impl.h

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,9 @@
3939
#define SECP256K1_N_H_7 ((uint32_t)0x7FFFFFFFUL)
4040

4141
SECP256K1_INLINE static void secp256k1_scalar_set_int(secp256k1_scalar *r, unsigned int v) {
42+
43+
/* TODO: parallel, SSE2 (32bit cpu only) */
44+
4245
r->d[0] = v;
4346
r->d[1] = 0;
4447
r->d[2] = 0;
@@ -150,6 +153,8 @@ static void secp256k1_scalar_cadd_bit(secp256k1_scalar *r, unsigned int bit, int
150153
SECP256K1_SCALAR_VERIFY(r);
151154
VERIFY_CHECK(bit < 256);
152155

156+
/* TODO: parallel, SSE2 (32bit cpu only) */
157+
153158
bit += ((uint32_t) vflag - 1) & 0x100; /* forcing (bit >> 5) > 7 makes this a noop */
154159
t = (uint64_t)r->d[0] + (((uint32_t)((bit >> 5) == 0)) << (bit & 0x1F));
155160
r->d[0] = t & 0xFFFFFFFFULL; t >>= 32;
@@ -174,6 +179,9 @@ static void secp256k1_scalar_cadd_bit(secp256k1_scalar *r, unsigned int bit, int
174179

175180
static void secp256k1_scalar_set_b32(secp256k1_scalar *r, const unsigned char *b32, int *overflow) {
176181
int over;
182+
183+
/* TODO: parallel, SSE2 (32bit cpu only) */
184+
177185
r->d[0] = secp256k1_read_be32(&b32[28]);
178186
r->d[1] = secp256k1_read_be32(&b32[24]);
179187
r->d[2] = secp256k1_read_be32(&b32[20]);
@@ -193,6 +201,8 @@ static void secp256k1_scalar_set_b32(secp256k1_scalar *r, const unsigned char *b
193201
static void secp256k1_scalar_get_b32(unsigned char *bin, const secp256k1_scalar* a) {
194202
SECP256K1_SCALAR_VERIFY(a);
195203

204+
/* TODO: parallel, SSE2 (32bit cpu only) */
205+
196206
secp256k1_write_be32(&bin[0], a->d[7]);
197207
secp256k1_write_be32(&bin[4], a->d[6]);
198208
secp256k1_write_be32(&bin[8], a->d[5]);
@@ -206,6 +216,8 @@ static void secp256k1_scalar_get_b32(unsigned char *bin, const secp256k1_scalar*
206216
SECP256K1_INLINE static int secp256k1_scalar_is_zero(const secp256k1_scalar *a) {
207217
SECP256K1_SCALAR_VERIFY(a);
208218

219+
/* TODO: parallel, SSE2 (32bit cpu only) */
220+
209221
return (a->d[0] | a->d[1] | a->d[2] | a->d[3] | a->d[4] | a->d[5] | a->d[6] | a->d[7]) == 0;
210222
}
211223

@@ -214,6 +226,8 @@ static void secp256k1_scalar_negate(secp256k1_scalar *r, const secp256k1_scalar
214226
uint64_t t = (uint64_t)(~a->d[0]) + SECP256K1_N_0 + 1;
215227
SECP256K1_SCALAR_VERIFY(a);
216228

229+
/* TODO: parallel, SSE2 (32bit cpu only) */
230+
217231
r->d[0] = t & nonzero; t >>= 32;
218232
t += (uint64_t)(~a->d[1]) + SECP256K1_N_1;
219233
r->d[1] = t & nonzero; t >>= 32;
@@ -284,6 +298,8 @@ static void secp256k1_scalar_half(secp256k1_scalar *r, const secp256k1_scalar *a
284298
SECP256K1_INLINE static int secp256k1_scalar_is_one(const secp256k1_scalar *a) {
285299
SECP256K1_SCALAR_VERIFY(a);
286300

301+
/* TODO: parallel, SSE2 (32bit cpu only) */
302+
287303
return ((a->d[0] ^ 1) | a->d[1] | a->d[2] | a->d[3] | a->d[4] | a->d[5] | a->d[6] | a->d[7]) == 0;
288304
}
289305

@@ -652,6 +668,8 @@ static void secp256k1_scalar_mul(secp256k1_scalar *r, const secp256k1_scalar *a,
652668
static void secp256k1_scalar_split_128(secp256k1_scalar *r1, secp256k1_scalar *r2, const secp256k1_scalar *k) {
653669
SECP256K1_SCALAR_VERIFY(k);
654670

671+
/* TODO: parallel, SSE2 (32bit cpu only) */
672+
655673
r1->d[0] = k->d[0];
656674
r1->d[1] = k->d[1];
657675
r1->d[2] = k->d[2];
@@ -689,6 +707,8 @@ SECP256K1_INLINE static void secp256k1_scalar_mul_shift_var(secp256k1_scalar *r,
689707
SECP256K1_SCALAR_VERIFY(b);
690708
VERIFY_CHECK(shift >= 256);
691709

710+
/* TODO: parallel, SSE2 (32bit cpu only) */
711+
692712
secp256k1_scalar_mul_512(l, a, b);
693713
shiftlimbs = shift >> 5;
694714
shiftlow = shift & 0x1F;
@@ -712,6 +732,8 @@ static SECP256K1_INLINE void secp256k1_scalar_cmov(secp256k1_scalar *r, const se
712732
SECP256K1_SCALAR_VERIFY(a);
713733
SECP256K1_CHECKMEM_CHECK_VERIFY(r->d, sizeof(r->d));
714734

735+
/* TODO: parallel, SSE2 (32bit cpu only) */
736+
715737
mask0 = vflag + ~((uint32_t)0);
716738
mask1 = ~mask0;
717739
r->d[0] = (r->d[0] & mask0) | (a->d[0] & mask1);
@@ -743,6 +765,8 @@ static void secp256k1_scalar_from_signed30(secp256k1_scalar *r, const secp256k1_
743765
VERIFY_CHECK(a7 >> 30 == 0);
744766
VERIFY_CHECK(a8 >> 16 == 0);
745767

768+
/* TODO: parallel, SSE2 (32bit cpu only) */
769+
746770
r->d[0] = a0 | a1 << 30;
747771
r->d[1] = a1 >> 2 | a2 << 28;
748772
r->d[2] = a2 >> 4 | a3 << 26;
@@ -761,6 +785,8 @@ static void secp256k1_scalar_to_signed30(secp256k1_modinv32_signed30 *r, const s
761785
a4 = a->d[4], a5 = a->d[5], a6 = a->d[6], a7 = a->d[7];
762786
SECP256K1_SCALAR_VERIFY(a);
763787

788+
/* TODO: parallel, SSE2 (32bit cpu only) */
789+
764790
r->v[0] = a0 & M30;
765791
r->v[1] = (a0 >> 30 | a1 << 2) & M30;
766792
r->v[2] = (a1 >> 28 | a2 << 4) & M30;

0 commit comments

Comments
 (0)