@@ -14,162 +14,164 @@ cpufeatures::new!(sha1_hwcap, "sha2");
 unsafe fn compress_sha1_neon(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
     use core::arch::aarch64::*;
 
-    let mut abcd = vld1q_u32(state.as_ptr());
-    let mut e0 = state[4];
-    let [k0, k1, k2, k3] = K.map(|k| vdupq_n_u32(k));
-    let (mut e1, mut tmp0, mut tmp1);
-
-    for block in blocks {
-        let abcd_cpy = abcd;
-        let e0_cpy = e0;
-
-        // Load and reverse byte order
-        let [mut msg0, mut msg1, mut msg2, mut msg3] = [0, 1, 2, 3].map(|i| {
-            let p = block.as_ptr().add(16 * i);
-            vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(p)))
-        });
-
-        tmp0 = vaddq_u32(msg0, k0);
-        tmp1 = vaddq_u32(msg1, k0);
-
-        // Rounds 0-3
-        e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
-        abcd = vsha1cq_u32(abcd, e0, tmp0);
-        tmp0 = vaddq_u32(msg2, k0);
-        msg0 = vsha1su0q_u32(msg0, msg1, msg2);
-
-        // Rounds 4-7
-        e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
-        abcd = vsha1cq_u32(abcd, e1, tmp1);
-        tmp1 = vaddq_u32(msg3, k0);
-        msg0 = vsha1su1q_u32(msg0, msg3);
-        msg1 = vsha1su0q_u32(msg1, msg2, msg3);
-
-        // Rounds 8-11
-        e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
-        abcd = vsha1cq_u32(abcd, e0, tmp0);
-        tmp0 = vaddq_u32(msg0, k0);
-        msg1 = vsha1su1q_u32(msg1, msg0);
-        msg2 = vsha1su0q_u32(msg2, msg3, msg0);
-
-        // Rounds 12-15
-        e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
-        abcd = vsha1cq_u32(abcd, e1, tmp1);
-        tmp1 = vaddq_u32(msg1, k1);
-        msg2 = vsha1su1q_u32(msg2, msg1);
-        msg3 = vsha1su0q_u32(msg3, msg0, msg1);
-
-        // Rounds 16-19
-        e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
-        abcd = vsha1cq_u32(abcd, e0, tmp0);
-        tmp0 = vaddq_u32(msg2, k1);
-        msg3 = vsha1su1q_u32(msg3, msg2);
-        msg0 = vsha1su0q_u32(msg0, msg1, msg2);
-
-        // Rounds 20-23
-        e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
-        abcd = vsha1pq_u32(abcd, e1, tmp1);
-        tmp1 = vaddq_u32(msg3, k1);
-        msg0 = vsha1su1q_u32(msg0, msg3);
-        msg1 = vsha1su0q_u32(msg1, msg2, msg3);
-
-        // Rounds 24-27
-        e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
-        abcd = vsha1pq_u32(abcd, e0, tmp0);
-        tmp0 = vaddq_u32(msg0, k1);
-        msg1 = vsha1su1q_u32(msg1, msg0);
-        msg2 = vsha1su0q_u32(msg2, msg3, msg0);
-
-        // Rounds 28-31
-        e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
-        abcd = vsha1pq_u32(abcd, e1, tmp1);
-        tmp1 = vaddq_u32(msg1, k1);
-        msg2 = vsha1su1q_u32(msg2, msg1);
-        msg3 = vsha1su0q_u32(msg3, msg0, msg1);
-
-        // Rounds 32-35
-        e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
-        abcd = vsha1pq_u32(abcd, e0, tmp0);
-        tmp0 = vaddq_u32(msg2, k2);
-        msg3 = vsha1su1q_u32(msg3, msg2);
-        msg0 = vsha1su0q_u32(msg0, msg1, msg2);
-
-        // Rounds 36-39
-        e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
-        abcd = vsha1pq_u32(abcd, e1, tmp1);
-        tmp1 = vaddq_u32(msg3, k2);
-        msg0 = vsha1su1q_u32(msg0, msg3);
-        msg1 = vsha1su0q_u32(msg1, msg2, msg3);
-
-        // Rounds 40-43
-        e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
-        abcd = vsha1mq_u32(abcd, e0, tmp0);
-        tmp0 = vaddq_u32(msg0, k2);
-        msg1 = vsha1su1q_u32(msg1, msg0);
-        msg2 = vsha1su0q_u32(msg2, msg3, msg0);
-
-        // Rounds 44-47
-        e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
-        abcd = vsha1mq_u32(abcd, e1, tmp1);
-        tmp1 = vaddq_u32(msg1, k2);
-        msg2 = vsha1su1q_u32(msg2, msg1);
-        msg3 = vsha1su0q_u32(msg3, msg0, msg1);
-
-        // Rounds 48-51
-        e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
-        abcd = vsha1mq_u32(abcd, e0, tmp0);
-        tmp0 = vaddq_u32(msg2, k2);
-        msg3 = vsha1su1q_u32(msg3, msg2);
-        msg0 = vsha1su0q_u32(msg0, msg1, msg2);
-
-        // Rounds 52-55
-        e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
-        abcd = vsha1mq_u32(abcd, e1, tmp1);
-        tmp1 = vaddq_u32(msg3, k3);
-        msg0 = vsha1su1q_u32(msg0, msg3);
-        msg1 = vsha1su0q_u32(msg1, msg2, msg3);
-
-        // Rounds 56-59
-        e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
-        abcd = vsha1mq_u32(abcd, e0, tmp0);
-        tmp0 = vaddq_u32(msg0, k3);
-        msg1 = vsha1su1q_u32(msg1, msg0);
-        msg2 = vsha1su0q_u32(msg2, msg3, msg0);
-
-        // Rounds 60-63
-        e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
-        abcd = vsha1pq_u32(abcd, e1, tmp1);
-        tmp1 = vaddq_u32(msg1, k3);
-        msg2 = vsha1su1q_u32(msg2, msg1);
-        msg3 = vsha1su0q_u32(msg3, msg0, msg1);
-
-        // Rounds 64-67
-        e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
-        abcd = vsha1pq_u32(abcd, e0, tmp0);
-        tmp0 = vaddq_u32(msg2, k3);
-        msg3 = vsha1su1q_u32(msg3, msg2);
-
-        // Rounds 68-71
-        e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
-        abcd = vsha1pq_u32(abcd, e1, tmp1);
-        tmp1 = vaddq_u32(msg3, k3);
-
-        // Rounds 72-75
-        e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
-        abcd = vsha1pq_u32(abcd, e0, tmp0);
-
-        // Rounds 76-79
-        e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
-        abcd = vsha1pq_u32(abcd, e1, tmp1);
-
-        // Update state
-        abcd = vaddq_u32(abcd_cpy, abcd);
-        e0 = e0.wrapping_add(e0_cpy);
-    }
+    unsafe {
+        let mut abcd = vld1q_u32(state.as_ptr());
+        let mut e0 = state[4];
+        let [k0, k1, k2, k3] = K.map(|k| vdupq_n_u32(k));
+        let (mut e1, mut tmp0, mut tmp1);
+
+        for block in blocks {
+            let abcd_cpy = abcd;
+            let e0_cpy = e0;
+
+            // Load and reverse byte order
+            let [mut msg0, mut msg1, mut msg2, mut msg3] = [0, 1, 2, 3].map(|i| {
+                let p = block.as_ptr().add(16 * i);
+                vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(p)))
+            });
+
+            tmp0 = vaddq_u32(msg0, k0);
+            tmp1 = vaddq_u32(msg1, k0);
+
+            // Rounds 0-3
+            e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+            abcd = vsha1cq_u32(abcd, e0, tmp0);
+            tmp0 = vaddq_u32(msg2, k0);
+            msg0 = vsha1su0q_u32(msg0, msg1, msg2);
+
+            // Rounds 4-7
+            e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+            abcd = vsha1cq_u32(abcd, e1, tmp1);
+            tmp1 = vaddq_u32(msg3, k0);
+            msg0 = vsha1su1q_u32(msg0, msg3);
+            msg1 = vsha1su0q_u32(msg1, msg2, msg3);
+
+            // Rounds 8-11
+            e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+            abcd = vsha1cq_u32(abcd, e0, tmp0);
+            tmp0 = vaddq_u32(msg0, k0);
+            msg1 = vsha1su1q_u32(msg1, msg0);
+            msg2 = vsha1su0q_u32(msg2, msg3, msg0);
+
+            // Rounds 12-15
+            e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+            abcd = vsha1cq_u32(abcd, e1, tmp1);
+            tmp1 = vaddq_u32(msg1, k1);
+            msg2 = vsha1su1q_u32(msg2, msg1);
+            msg3 = vsha1su0q_u32(msg3, msg0, msg1);
+
+            // Rounds 16-19
+            e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+            abcd = vsha1cq_u32(abcd, e0, tmp0);
+            tmp0 = vaddq_u32(msg2, k1);
+            msg3 = vsha1su1q_u32(msg3, msg2);
+            msg0 = vsha1su0q_u32(msg0, msg1, msg2);
+
+            // Rounds 20-23
+            e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+            abcd = vsha1pq_u32(abcd, e1, tmp1);
+            tmp1 = vaddq_u32(msg3, k1);
+            msg0 = vsha1su1q_u32(msg0, msg3);
+            msg1 = vsha1su0q_u32(msg1, msg2, msg3);
+
+            // Rounds 24-27
+            e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+            abcd = vsha1pq_u32(abcd, e0, tmp0);
+            tmp0 = vaddq_u32(msg0, k1);
+            msg1 = vsha1su1q_u32(msg1, msg0);
+            msg2 = vsha1su0q_u32(msg2, msg3, msg0);
+
+            // Rounds 28-31
+            e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+            abcd = vsha1pq_u32(abcd, e1, tmp1);
+            tmp1 = vaddq_u32(msg1, k1);
+            msg2 = vsha1su1q_u32(msg2, msg1);
+            msg3 = vsha1su0q_u32(msg3, msg0, msg1);
+
+            // Rounds 32-35
+            e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+            abcd = vsha1pq_u32(abcd, e0, tmp0);
+            tmp0 = vaddq_u32(msg2, k2);
+            msg3 = vsha1su1q_u32(msg3, msg2);
+            msg0 = vsha1su0q_u32(msg0, msg1, msg2);
+
+            // Rounds 36-39
+            e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+            abcd = vsha1pq_u32(abcd, e1, tmp1);
+            tmp1 = vaddq_u32(msg3, k2);
+            msg0 = vsha1su1q_u32(msg0, msg3);
+            msg1 = vsha1su0q_u32(msg1, msg2, msg3);
+
+            // Rounds 40-43
+            e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+            abcd = vsha1mq_u32(abcd, e0, tmp0);
+            tmp0 = vaddq_u32(msg0, k2);
+            msg1 = vsha1su1q_u32(msg1, msg0);
+            msg2 = vsha1su0q_u32(msg2, msg3, msg0);
+
+            // Rounds 44-47
+            e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+            abcd = vsha1mq_u32(abcd, e1, tmp1);
+            tmp1 = vaddq_u32(msg1, k2);
+            msg2 = vsha1su1q_u32(msg2, msg1);
+            msg3 = vsha1su0q_u32(msg3, msg0, msg1);
+
+            // Rounds 48-51
+            e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+            abcd = vsha1mq_u32(abcd, e0, tmp0);
+            tmp0 = vaddq_u32(msg2, k2);
+            msg3 = vsha1su1q_u32(msg3, msg2);
+            msg0 = vsha1su0q_u32(msg0, msg1, msg2);
+
+            // Rounds 52-55
+            e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+            abcd = vsha1mq_u32(abcd, e1, tmp1);
+            tmp1 = vaddq_u32(msg3, k3);
+            msg0 = vsha1su1q_u32(msg0, msg3);
+            msg1 = vsha1su0q_u32(msg1, msg2, msg3);
+
+            // Rounds 56-59
+            e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+            abcd = vsha1mq_u32(abcd, e0, tmp0);
+            tmp0 = vaddq_u32(msg0, k3);
+            msg1 = vsha1su1q_u32(msg1, msg0);
+            msg2 = vsha1su0q_u32(msg2, msg3, msg0);
+
+            // Rounds 60-63
+            e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+            abcd = vsha1pq_u32(abcd, e1, tmp1);
+            tmp1 = vaddq_u32(msg1, k3);
+            msg2 = vsha1su1q_u32(msg2, msg1);
+            msg3 = vsha1su0q_u32(msg3, msg0, msg1);
+
+            // Rounds 64-67
+            e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+            abcd = vsha1pq_u32(abcd, e0, tmp0);
+            tmp0 = vaddq_u32(msg2, k3);
+            msg3 = vsha1su1q_u32(msg3, msg2);
+
+            // Rounds 68-71
+            e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+            abcd = vsha1pq_u32(abcd, e1, tmp1);
+            tmp1 = vaddq_u32(msg3, k3);
+
+            // Rounds 72-75
+            e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+            abcd = vsha1pq_u32(abcd, e0, tmp0);
+
+            // Rounds 76-79
+            e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+            abcd = vsha1pq_u32(abcd, e1, tmp1);
+
+            // Update state
+            abcd = vaddq_u32(abcd_cpy, abcd);
+            e0 = e0.wrapping_add(e0_cpy);
+        }
 
-    // Save state
-    vst1q_u32(state.as_mut_ptr(), abcd);
-    state[4] = e0;
+        // Save state
+        vst1q_u32(state.as_mut_ptr(), abcd);
+        state[4] = e0;
+    }
 }
 
 pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
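
The hunk cuts off at the opening of the `pub fn compress` wrapper, whose body is not part of this diff. For context, a minimal sketch of how such a wrapper typically dispatches on the runtime check generated by the `cpufeatures::new!(sha1_hwcap, "sha2")` line in the hunk header; the scalar fallback path name (`super::soft::compress`) is an assumption about the crate's module layout, not something shown in this commit:

pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
    // `sha1_hwcap::get()` is generated by the `cpufeatures::new!` invocation above
    // and reports whether the SHA instructions are available on this CPU.
    if sha1_hwcap::get() {
        // SAFETY: the required target feature was detected at runtime,
        // so calling the NEON implementation is sound.
        unsafe { compress_sha1_neon(state, blocks) }
    } else {
        // Assumed portable software fallback; the actual path may differ in the crate.
        super::soft::compress(state, blocks);
    }
}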