1
1
using LoopVectorization
2
2
using Test
3
3
4
- using CpuId, VectorizationBase, SIMDPirates, SLEEFPirates
5
- @generated function estimate_cost (f:: F , N:: Int = 512 , K = 1_000 , :: Type{T} = Float64, :: Val{U} = Val (4 )) where {F,T,U}
4
+ using CpuId, VectorizationBase, SIMDPirates, SLEEFPirates, VectorizedRNG
5
+
6
# Estimate the per-call cost of a scalar one-argument function `f`.
#
# A length-`N` vector is filled with `rand(T, N)`; the timed region then makes `K` passes
# over it, applying `f` to each element and adding the result to one of `U` independent
# accumulators (the inner loop body is unrolled `U` times with `Base.Cartesian.@nexprs`).
#
# Arguments
#   f      - callable taking one `T`; its result is added to a `T` accumulator
#   N      - elements per pass (default 512); a pass touches `(N >> log2(U)) * U` of them
#   K      - number of passes over the data (default 1_000)
#   T      - element type (default Float64)
#   Val(U) - unroll factor / number of accumulators (default 4); must be a power of two
#
# Returns `(cost, checksum)`: the difference of the two `cpucycle_id()` timestamps divided
# by `N * K`, and the sum of all accumulators (presumably returned so the compiler cannot
# discard the benchmarked work).
#
# Throws `ArgumentError` when `U` is not a positive power of two, and `AssertionError` when
# the id reported by `cpucycle_id()` differs between the start and end of the timed region.
@generated function estimate_cost_onearg_serial(
    f::F, N::Int = 512, K = 1_000, ::Type{T} = Float64, ::Val{U} = Val(4)
) where {F,T,U}
    # The trip count is N >> log2(U); for any other U the unrolled body would index past
    # the end of `x` under @inbounds, so refuse to generate that code.
    if !(U isa Integer && U > 0 && ispow2(U))
        msg = "U must be a positive power of two, got $U"
        return :(throw(ArgumentError($msg)))
    end
    log2U = VectorizationBase.intlog2(U)
    quote
        Base.Cartesian.@nexprs $U u -> s_u = zero(T)
        x = rand(T, N)
        ts_start, id_start = cpucycle_id()
        @inbounds for k ∈ 1:K
            i = 1
            for n ∈ 1:(N >> $log2U)
                Base.Cartesian.@nexprs $U u -> begin
                    v_u = x[i]
                    i += 1
                    s_u += f(v_u)
                end
            end
        end
        ts_end, id_end = cpucycle_id()
        # A changed id invalidates the timestamp difference.
        @assert id_start == id_end
        # Fold every accumulator into s_1.
        Base.Cartesian.@nexprs $(U - 1) u -> s_1 += s_{u+1}
        (ts_end - ts_start) / (N * K), s_1
    end
end
29
# Estimate the per-call cost of a scalar one-argument function `f` that returns two values
# (e.g. `sincos`).
#
# A length-`N` vector is filled with `rand(T, N)`; the timed region makes `K` passes over
# it. For every element `v`, `a, b = f(v)` is evaluated and folded into one of `U`
# independent accumulators as `s = muladd(a, b, s)`, so both return values are consumed.
#
# Arguments
#   f      - callable taking one `T` and returning a two-element result
#   N      - elements per pass (default 512); a pass touches `(N >> log2(U)) * U` of them
#   K      - number of passes over the data (default 1_000)
#   T      - element type (default Float64)
#   Val(U) - unroll factor / number of accumulators (default 4); must be a power of two
#
# Returns `(cost, checksum)`: the difference of the two `cpucycle_id()` timestamps divided
# by `N * K`, and the sum of all accumulators.
#
# Throws `ArgumentError` when `U` is not a positive power of two, and `AssertionError` when
# the id reported by `cpucycle_id()` differs between the start and end of the timed region.
@generated function estimate_cost_onearg_tworet_serial(
    f::F, N::Int = 512, K = 1_000, ::Type{T} = Float64, ::Val{U} = Val(4)
) where {F,T,U}
    # The trip count is N >> log2(U); for any other U the unrolled body would index past
    # the end of `x` under @inbounds, so refuse to generate that code.
    if !(U isa Integer && U > 0 && ispow2(U))
        msg = "U must be a positive power of two, got $U"
        return :(throw(ArgumentError($msg)))
    end
    log2U = VectorizationBase.intlog2(U)
    quote
        Base.Cartesian.@nexprs $U u -> s_u = zero(T)
        x = rand(T, N)
        ts_start, id_start = cpucycle_id()
        @inbounds for k ∈ 1:K
            i = 1
            for n ∈ 1:(N >> $log2U)
                Base.Cartesian.@nexprs $U u -> begin
                    v_u = x[i]
                    i += 1
                    a_u, b_u = f(v_u)
                    s_u = muladd(a_u, b_u, s_u)
                end
            end
        end
        ts_end, id_end = cpucycle_id()
        # A changed id invalidates the timestamp difference.
        @assert id_start == id_end
        # Fold every accumulator into s_1.
        Base.Cartesian.@nexprs $(U - 1) u -> s_1 += s_{u+1}
        (ts_end - ts_start) / (N * K), s_1
    end
end
53
+ @generated function estimate_cost_onearg (f:: F , N:: Int = 512 , K = 1_000 , :: Type{T} = Float64, :: Val{U} = Val (4 )) where {F,T,U}
6
54
W, Wshift = VectorizationBase. pick_vector_width_shift (T)
7
55
quote
8
56
Base. Cartesian. @nexprs $ U u -> s_u = vbroadcast (Vec{$ W,$ T}, zero (T))
@@ -29,23 +77,239 @@ using CpuId, VectorizationBase, SIMDPirates, SLEEFPirates
29
77
(ts_end - ts_start) / (N* K), vsum (s_1)
30
78
end
31
79
end
32
- estimate_cost (SLEEFPirates. exp, 512 , 1_000 , Float64, Val (4 )) # 28
33
-
34
- estimate_cost (SLEEFPirates. log, 512 , 1_000 , Float64, Val (1 )) # 51 cycles
35
- estimate_cost (SLEEFPirates. log, 512 , 1_000 , Float64, Val (2 )) # 51 cycles
36
- estimate_cost (SLEEFPirates. log, 512 , 1_000 , Float64, Val (4 )) # 51 cycles
37
- estimate_cost (SIMDPirates. vsqrt, 512 , 1_000 , Float64, Val (1 )) # 23 cycles
38
- estimate_cost (SIMDPirates. vsqrt, 512 , 1_000 , Float64, Val (2 )) # 23 cycles
39
- estimate_cost (SIMDPirates. vsqrt, 512 , 1_000 , Float64, Val (4 )) # 23 cycles
40
- estimate_cost (SIMDPirates. vinv, 512 , 1_000 , Float64, Val (1 )) # 23 cycles
41
- estimate_cost (SIMDPirates. vinv, 512 , 1_000 , Float64, Val (2 )) # 23 cycles
42
- estimate_cost (SIMDPirates. vinv, 512 , 1_000 , Float64, Val (4 )) # 23 cycles
80
# Estimate the per-call cost of a SIMD one-argument function `f` that returns two vectors
# (e.g. `SLEEFPirates.sincos`).
#
# `W, Wshift` come from `VectorizationBase.pick_vector_width_shift(T)`. A buffer of
# `N << Wshift` random `T`s is allocated and walked through a raw pointer: every step
# loads one `Vec{W,T}`, evaluates `a, b = f(v)`, and folds the pair into one of `U`
# independent vector accumulators with `vmuladd(a, b, s)`.
#
# Arguments
#   f      - callable taking one `Vec{W,T}` and returning two values accepted by `vmuladd`
#   N      - number of vectors per pass (default 512)
#   K      - number of passes over the buffer (default 1_000)
#   T      - element type (default Float64)
#   Val(U) - unroll factor / number of accumulators (default 4); must be a power of two
#
# Returns `(cost, checksum)`: the difference of the two `cpucycle_id()` timestamps divided
# by `N * K` (i.e. per vector call), and `vsum` of the combined accumulators.
#
# Throws `ArgumentError` when `U` is not a positive power of two, and `AssertionError` when
# the id reported by `cpucycle_id()` differs between the start and end of the timed region.
@generated function estimate_cost_onearg_tworet(
    f::F, N::Int = 512, K = 1_000, ::Type{T} = Float64, ::Val{U} = Val(4)
) where {F,T,U}
    # The trip count is N >> log2(U); for any other U the unrolled pointer loads would run
    # past the end of the buffer, so refuse to generate that code.
    if !(U isa Integer && U > 0 && ispow2(U))
        msg = "U must be a positive power of two, got $U"
        return :(throw(ArgumentError($msg)))
    end
    W, Wshift = VectorizationBase.pick_vector_width_shift(T)
    log2U = VectorizationBase.intlog2(U)
    quote
        Base.Cartesian.@nexprs $U u -> s_u = vbroadcast(Vec{$W,$T}, zero(T))
        x = rand(T, N << $Wshift)
        ptrx = pointer(x)
        ts_start, id_start = cpucycle_id()
        # `x` is only reached through `ptrx` from here on; keep it rooted while the raw
        # pointer is being dereferenced.
        GC.@preserve x begin
            for k ∈ 1:K
                _ptrx = ptrx
                for n ∈ 1:(N >> $log2U)
                    Base.Cartesian.@nexprs $U u -> begin
                        v_u = vload(Vec{$W,$T}, _ptrx)
                        a_u, b_u = f(v_u)
                        s_u = vmuladd(a_u, b_u, s_u)
                        # Step to the next vector (presumably REGISTER_SIZE is the byte
                        # width of one Vec{W,T}).
                        _ptrx += VectorizationBase.REGISTER_SIZE
                    end
                end
            end
        end
        ts_end, id_end = cpucycle_id()
        # A changed id invalidates the timestamp difference.
        @assert id_start == id_end
        # Fold every accumulator into s_1.
        Base.Cartesian.@nexprs $(U - 1) u -> s_1 = vadd(s_1, s_{u+1})
        (ts_end - ts_start) / (N * K), vsum(s_1)
    end
end
108
# Estimate the per-call cost of a SIMD two-argument function `f` (e.g. `vmul`, `vadd`),
# used as an accumulator update `s = f(s, v)`.
#
# `W, Wshift` come from `VectorizationBase.pick_vector_width_shift(T)`. A buffer of
# `N << Wshift` random `T`s is allocated and walked through a raw pointer.
#
# * `U == 1`: one accumulator, started at `one(T)`, is updated once per loaded vector;
#   the elapsed count is divided by `N * K`.
# * `U > 1`: `2U` accumulators are started from `randn` draws of `VectorizedRNG.GLOBAL_vPCG`.
#   Each inner step loads `U` vectors and issues `2U` calls to `f` (every loaded vector
#   feeds two different accumulators), so a pass performs `2N` calls and the elapsed
#   count is divided by `2N * K`.
#
# Arguments
#   f      - callable taking `(accumulator, vector)` and returning the new accumulator
#   N      - number of vectors per pass (default 512)
#   K      - number of passes over the buffer (default 1_000)
#   T      - element type (default Float64)
#   Val(U) - unroll factor (default 4); must be a power of two
#
# Returns `(cost, checksum)`: elapsed `cpucycle_id()` timestamp difference per call of `f`,
# and `vsum` of the combined accumulators.
#
# Throws `ArgumentError` when `U` is not a positive power of two, and `AssertionError` when
# the id reported by `cpucycle_id()` differs between the start and end of the timed region.
@generated function estimate_cost_twoarg(
    f::F, N::Int = 512, K = 1_000, ::Type{T} = Float64, ::Val{U} = Val(4)
) where {F,T,U}
    # The trip count is N >> log2(U) and the unrolled body assumes U splits in half
    # evenly; for any other U the pointer loads would run past the end of the buffer.
    if !(U isa Integer && U > 0 && ispow2(U))
        msg = "U must be a positive power of two, got $U"
        return :(throw(ArgumentError($msg)))
    end
    W, Wshift = VectorizationBase.pick_vector_width_shift(T)
    log2U = VectorizationBase.intlog2(U)
    if U == 1
        return quote
            Base.Cartesian.@nexprs $U u -> s_u = vbroadcast(Vec{$W,$T}, one(T))
            x = rand(T, N << $Wshift)
            ptrx = pointer(x)
            ts_start, id_start = cpucycle_id()
            # `x` is only reached through `ptrx` from here on; keep it rooted while the
            # raw pointer is being dereferenced.
            GC.@preserve x begin
                for k ∈ 1:K
                    _ptrx = ptrx
                    for n ∈ 1:(N >> $log2U)
                        Base.Cartesian.@nexprs $U u -> begin
                            v_u = vload(Vec{$W,$T}, _ptrx)
                            s_u = f(s_u, v_u)
                            _ptrx += VectorizationBase.REGISTER_SIZE
                        end
                    end
                end
            end
            ts_end, id_end = cpucycle_id()
            @assert id_start == id_end
            Base.Cartesian.@nexprs $(U - 1) u -> s_1 = vadd(s_1, s_{u+1})
            (ts_end - ts_start) / (N * K), vsum(s_1)
        end
    end
    Uh = U >>> 1
    quote
        # Random (rather than constant) starting values for the 2U accumulators.
        Base.Cartesian.@nexprs $(U << 1) u -> s_u = randn(
            VectorizedRNG.GLOBAL_vPCG, Vec{$W,$T}
        )
        x = rand(T, N << $Wshift)
        ptrx = pointer(x)
        ts_start, id_start = cpucycle_id()
        # `x` is only reached through `ptrx` from here on; keep it rooted while the raw
        # pointer is being dereferenced.
        GC.@preserve x begin
            for k ∈ 1:K
                _ptrx = ptrx
                for n ∈ 1:(N >> $log2U)
                    Base.Cartesian.@nexprs $Uh u -> begin
                        # Two loads per unrolled step ...
                        v_u = vload(Vec{$W,$T}, _ptrx)
                        _ptrx += VectorizationBase.REGISTER_SIZE
                        v_{u+$Uh} = vload(Vec{$W,$T}, _ptrx)
                        _ptrx += VectorizationBase.REGISTER_SIZE
                        # ... feeding four independent accumulator updates.
                        s_u = f(s_u, v_u)
                        s_{u+$Uh} = f(s_{u+$Uh}, v_{u+$Uh})
                        s_{u+$U} = f(s_{u+$U}, v_u)
                        s_{u+$(Uh + U)} = f(s_{u+$(Uh + U)}, v_{u+$Uh})
                    end
                end
            end
        end
        ts_end, id_end = cpucycle_id()
        # A changed id invalidates the timestamp difference.
        @assert id_start == id_end
        # Fold all 2U accumulators into s_1.
        Base.Cartesian.@nexprs $((U << 1) - 1) u -> s_1 = vadd(s_1, s_{u+1})
        (ts_end - ts_start) / (2N * K), vsum(s_1)
    end
end
168
# Estimate the per-call cost of a SIMD three-argument function `f` (e.g. `vmuladd`),
# used as an accumulator update `s = f(a, b, s)`.
#
# `W, Wshift` come from `VectorizationBase.pick_vector_width_shift(T)`. A buffer of
# `N << Wshift` random `T`s is allocated and walked through a raw pointer. Accumulators
# start at `zero(T)`.
#
# * `U == 1`: one accumulator is updated as `f(v, v, s)` once per loaded vector; the
#   elapsed count is divided by `N * K`.
# * `U > 1`: `2U` accumulators are used. Each inner step loads `U` vectors and issues
#   `2U` calls to `f` (two "square" updates `f(v, v, s)` and two "cross" updates
#   `f(v_lo, v_hi, s)` per pair of loads), so a pass performs `2N` calls and the elapsed
#   count is divided by `2N * K`.
#
# Arguments
#   f      - callable taking `(vector, vector, accumulator)`, returning the new accumulator
#   N      - number of vectors per pass (default 512)
#   K      - number of passes over the buffer (default 1_000)
#   T      - element type (default Float64)
#   Val(U) - unroll factor (default 4); must be a power of two
#
# Returns `(cost, checksum)`: elapsed `cpucycle_id()` timestamp difference per call of `f`,
# and `vsum` of the combined accumulators.
#
# Throws `ArgumentError` when `U` is not a positive power of two, and `AssertionError` when
# the id reported by `cpucycle_id()` differs between the start and end of the timed region.
@generated function estimate_cost_threearg(
    f::F, N::Int = 512, K = 1_000, ::Type{T} = Float64, ::Val{U} = Val(4)
) where {F,T,U}
    # The trip count is N >> log2(U) and the unrolled body assumes U splits in half
    # evenly; for any other U the pointer loads would run past the end of the buffer.
    if !(U isa Integer && U > 0 && ispow2(U))
        msg = "U must be a positive power of two, got $U"
        return :(throw(ArgumentError($msg)))
    end
    W, Wshift = VectorizationBase.pick_vector_width_shift(T)
    log2U = VectorizationBase.intlog2(U)
    if U == 1
        return quote
            Base.Cartesian.@nexprs $U u -> s_u = vbroadcast(Vec{$W,$T}, zero(T))
            x = rand(T, N << $Wshift)
            ptrx = pointer(x)
            ts_start, id_start = cpucycle_id()
            # `x` is only reached through `ptrx` from here on; keep it rooted while the
            # raw pointer is being dereferenced.
            GC.@preserve x begin
                for k ∈ 1:K
                    _ptrx = ptrx
                    for n ∈ 1:(N >> $log2U)
                        Base.Cartesian.@nexprs $U u -> begin
                            v_u = vload(Vec{$W,$T}, _ptrx)
                            s_u = f(v_u, v_u, s_u)
                            _ptrx += VectorizationBase.REGISTER_SIZE
                        end
                    end
                end
            end
            ts_end, id_end = cpucycle_id()
            @assert id_start == id_end
            Base.Cartesian.@nexprs $(U - 1) u -> s_1 = vadd(s_1, s_{u+1})
            (ts_end - ts_start) / (N * K), vsum(s_1)
        end
    end
    Uh = U >>> 1
    quote
        Base.Cartesian.@nexprs $(U << 1) u -> s_u = vbroadcast(Vec{$W,$T}, zero(T))
        x = rand(T, N << $Wshift)
        ptrx = pointer(x)
        ts_start, id_start = cpucycle_id()
        # `x` is only reached through `ptrx` from here on; keep it rooted while the raw
        # pointer is being dereferenced.
        GC.@preserve x begin
            for k ∈ 1:K
                _ptrx = ptrx
                for n ∈ 1:(N >> $log2U)
                    Base.Cartesian.@nexprs $Uh u -> begin
                        # Two loads per unrolled step ...
                        v_u = vload(Vec{$W,$T}, _ptrx)
                        _ptrx += VectorizationBase.REGISTER_SIZE
                        v_{u+$Uh} = vload(Vec{$W,$T}, _ptrx)
                        _ptrx += VectorizationBase.REGISTER_SIZE
                        # ... feeding four independent accumulator updates.
                        s_u = f(v_u, v_u, s_u)
                        s_{u+$Uh} = f(v_{u+$Uh}, v_{u+$Uh}, s_{u+$Uh})
                        s_{u+$U} = f(v_u, v_{u+$Uh}, s_{u+$U})
                        s_{u+$(Uh + U)} = f(v_u, v_{u+$Uh}, s_{u+$(Uh + U)})
                    end
                end
            end
        end
        ts_end, id_end = cpucycle_id()
        # A changed id invalidates the timestamp difference.
        @assert id_start == id_end
        # Fold all 2U accumulators into s_1.
        Base.Cartesian.@nexprs $((U << 1) - 1) u -> s_1 = vadd(s_1, s_{u+1})
        (ts_end - ts_start) / (2N * K), vsum(s_1)
    end
end
227
# Scalar baselines: Base math functions run through the serial harnesses with unroll
# factors 1, 2 and 4. The trailing numbers are the author's recorded results (presumably
# cycles per evaluation on their machine); "# SIMD" appears to mark runs where the
# compiler auto-vectorized the loop, which the @code_native calls let one inspect.
+ estimate_cost_onearg_serial (exp, 512 , 1_000 , Float64, Val (1 )) # 21
228
+ estimate_cost_onearg_serial (exp, 512 , 1_000 , Float64, Val (2 )) # 18.4
229
+ estimate_cost_onearg_serial (exp, 512 , 1_000 , Float64, Val (4 )) # 17.5
230
+
231
+ estimate_cost_onearg_serial (log, 512 , 1_000 , Float64, Val (1 )) # 22
232
+ estimate_cost_onearg_serial (log, 512 , 1_000 , Float64, Val (2 )) # 19
233
+ estimate_cost_onearg_serial (log, 512 , 1_000 , Float64, Val (4 )) # 19
234
+
235
+ estimate_cost_onearg_serial (Base. FastMath. sqrt_fast, 512 , 1_000 , Float64, Val (1 )) # 5
236
+ estimate_cost_onearg_serial (Base. FastMath. sqrt_fast, 512 , 1_000 , Float64, Val (2 )) # 2.5 # SIMD
237
+ estimate_cost_onearg_serial (Base. FastMath. sqrt_fast, 512 , 1_000 , Float64, Val (4 )) # 1.25 # SIMD
238
+ @code_native debuginfo= :none estimate_cost_onearg_serial (Base. FastMath. sqrt_fast, 512 , 1_000 , Float64, Val (4 )) # 1.25
239
+ estimate_cost_onearg_serial (sqrt, 512 , 1_000 , Float64, Val (1 )) # 5
240
+ estimate_cost_onearg_serial (sqrt, 512 , 1_000 , Float64, Val (2 )) # 2.5 # SIMD
241
+ estimate_cost_onearg_serial (sqrt, 512 , 1_000 , Float64, Val (4 )) # 1.25 # SIMD
242
+ @code_native debuginfo= :none estimate_cost_onearg_serial (sqrt, 512 , 1_000 , Float64, Val (4 )) # 1.25
243
+
244
+ estimate_cost_onearg_serial (sin, 512 , 1_000 , Float64, Val (1 )) # 18
245
+ estimate_cost_onearg_serial (sin, 512 , 1_000 , Float64, Val (2 )) # 15
246
+ estimate_cost_onearg_serial (sin, 512 , 1_000 , Float64, Val (4 )) # 15
247
+
248
+ estimate_cost_onearg_serial (cos, 512 , 1_000 , Float64, Val (1 )) # 19
249
+ estimate_cost_onearg_serial (cos, 512 , 1_000 , Float64, Val (2 )) # 16
250
+ estimate_cost_onearg_serial (cos, 512 , 1_000 , Float64, Val (4 )) # 16
251
+
# sincos returns two values, so it goes through the two-return variant of the harness.
252
+ estimate_cost_onearg_tworet_serial (sincos, 512 , 1_000 , Float64, Val (1 )) # 25
253
+ estimate_cost_onearg_tworet_serial (sincos, 512 , 1_000 , Float64, Val (2 )) # 23
254
+ estimate_cost_onearg_tworet_serial (sincos, 512 , 1_000 , Float64, Val (4 )) # 22
255
+
256
+
257
# Vectorized counterparts: SLEEFPirates / SIMDPirates functions run through the
# pointer-based one-argument harness with unroll factors 1, 2 and 4. Where two numbers
# trail a call, the first matches the figure recorded on the previous revision of that
# line and the second is presumably a newer measurement — confirm with the author.
+ estimate_cost_onearg (SLEEFPirates. exp, 512 , 1_000 , Float64, Val (1 )) # 28 # 21
258
+ estimate_cost_onearg (SLEEFPirates. exp, 512 , 1_000 , Float64, Val (2 )) # 28 # 20
259
+ estimate_cost_onearg (SLEEFPirates. exp, 512 , 1_000 , Float64, Val (4 )) # 28 # 19.5
260
+
261
+ estimate_cost_onearg (SLEEFPirates. log, 512 , 1_000 , Float64, Val (1 )) # 51 cycles # 44
262
+ estimate_cost_onearg (SLEEFPirates. log, 512 , 1_000 , Float64, Val (2 )) # 51 cycles # 40
263
+ estimate_cost_onearg (SLEEFPirates. log, 512 , 1_000 , Float64, Val (4 )) # 51 cycles # 39
264
+
265
+ estimate_cost_onearg (SIMDPirates. vsqrt, 512 , 1_000 , Float64, Val (1 )) # 23 cycles # 20
266
+ estimate_cost_onearg (SIMDPirates. vsqrt, 512 , 1_000 , Float64, Val (2 )) # 23 cycles # 20
267
+ estimate_cost_onearg (SIMDPirates. vsqrt, 512 , 1_000 , Float64, Val (4 )) # 23 cycles # 20
268
+
269
+ estimate_cost_onearg (SIMDPirates. vinv, 512 , 1_000 , Float64, Val (1 )) # 23 cycles # 13.4
270
+ estimate_cost_onearg (SIMDPirates. vinv, 512 , 1_000 , Float64, Val (2 )) # 23 cycles # 13.4
271
+ estimate_cost_onearg (SIMDPirates. vinv, 512 , 1_000 , Float64, Val (4 )) # 23 cycles # 13.4
272
+
273
+ estimate_cost_onearg (SLEEFPirates. sin, 512 , 1_000 , Float64, Val (1 )) # cycles # 68
274
+ estimate_cost_onearg (SLEEFPirates. sin, 512 , 1_000 , Float64, Val (2 )) # cycles # 66
275
+ estimate_cost_onearg (SLEEFPirates. sin, 512 , 1_000 , Float64, Val (4 )) # cycles # 66
276
+
277
+ estimate_cost_onearg (SLEEFPirates. cos, 512 , 1_000 , Float64, Val (1 )) # cycles # 65
278
+ estimate_cost_onearg (SLEEFPirates. cos, 512 , 1_000 , Float64, Val (2 )) # cycles # 68
279
+ estimate_cost_onearg (SLEEFPirates. cos, 512 , 1_000 , Float64, Val (4 )) # cycles # 66
280
+
# SLEEFPirates.sincos returns two vectors, so it uses the two-return harness.
281
+ estimate_cost_onearg_tworet (SLEEFPirates. sincos, 512 , 1_000 , Float64, Val (1 )) # cycles # 71
282
+ estimate_cost_onearg_tworet (SLEEFPirates. sincos, 512 , 1_000 , Float64, Val (2 )) # cycles # 71
283
+ estimate_cost_onearg_tworet (SLEEFPirates. sincos, 512 , 1_000 , Float64, Val (4 )) # cycles # 68
43
284
44
285
# Constant vector operand captured by the `vmul` closures benchmarked below: a 4-tuple of
# `Core.VecElement`s, each filled with a `randn()` draw.
# NOTE(review): the length is hard-coded to 4 — presumably it has to match the `Vec{W,T}`
# width that `estimate_cost_onearg` loads; confirm on targets where W != 4.
const cz = ntuple (Val (4 )) do i Core. VecElement (randn ()) end
45
286
# @code_native debuginfo=:none
46
- estimate_cost (x -> SIMDPirates. vmul (x,cz), 1 << 9 , 10 ^ 3 , Float64, Val (1 )) # 4.5 cycles
47
- estimate_cost (x -> SIMDPirates. vmul (x,cz), 1 << 9 , 10 ^ 3 , Float64, Val (2 )) # 2 cycles
48
- estimate_cost (x -> SIMDPirates. vmul (x,cz), 1 << 9 , 10 ^ 3 , Float64, Val (4 )) # 1 cycles
287
# Cheap arithmetic kernels with N = 1 << 9 vectors and K = 10^3 passes, sweeping the
# unroll factor. Trailing numbers are the author's recorded results (presumably cycles
# per call on their machine).
# Multiply-by-constant through the one-argument harness (closure over `cz`).
+ estimate_cost_onearg (x -> SIMDPirates. vmul (x,cz), 1 << 9 , 10 ^ 3 , Float64, Val (1 )) # 4.5 cycles # 3.35
288
+ estimate_cost_onearg (x -> SIMDPirates. vmul (x,cz), 1 << 9 , 10 ^ 3 , Float64, Val (2 )) # 2 cycles # 1.66
289
+ estimate_cost_onearg (x -> SIMDPirates. vmul (x,cz), 1 << 9 , 10 ^ 3 , Float64, Val (4 )) # 1 cycles # 1
290
+ estimate_cost_onearg (x -> SIMDPirates. vmul (x,cz), 1 << 9 , 10 ^ 3 , Float64, Val (8 )) # cycles # 0.65
291
+
# Binary multiply / add through the two-argument harness.
292
+ estimate_cost_twoarg (SIMDPirates. vmul, 1 << 9 , 10 ^ 3 , Float64, Val (1 )) # cycles # 3.3
293
+ estimate_cost_twoarg (SIMDPirates. vmul, 1 << 9 , 10 ^ 3 , Float64, Val (2 )) # cycles # 0.97
294
+ estimate_cost_twoarg (SIMDPirates. vmul, 1 << 9 , 10 ^ 3 , Float64, Val (4 )) # cycles # 0.52
295
+ estimate_cost_twoarg (SIMDPirates. vmul, 1 << 9 , 10 ^ 3 , Float64, Val (8 )) # cycles # 0.51
296
+ estimate_cost_twoarg (SIMDPirates. evmul, 1 << 9 , 10 ^ 3 , Float64, Val (8 )) # cycles # 0.51
297
+ estimate_cost_twoarg (SIMDPirates. vadd, 1 << 9 , 10 ^ 3 , Float64, Val (1 )) # cycles # 3.3
298
+ estimate_cost_twoarg (SIMDPirates. vadd, 1 << 9 , 10 ^ 3 , Float64, Val (2 )) # cycles # 0.97
299
+ estimate_cost_twoarg (SIMDPirates. vadd, 1 << 9 , 10 ^ 3 , Float64, Val (4 )) # cycles # 0.52
300
+ estimate_cost_twoarg (SIMDPirates. vadd, 1 << 9 , 10 ^ 3 , Float64, Val (8 )) # cycles # 0.5
301
+ estimate_cost_twoarg (SIMDPirates. evadd, 1 << 9 , 10 ^ 3 , Float64, Val (8 )) # cycles # 0.5
302
+
# Dump the generated machine code for the U = 8 multiply kernels.
303
+ @code_native debuginfo= :none estimate_cost_twoarg (SIMDPirates. vmul, 1 << 9 , 10 ^ 3 , Float64, Val (8 )) # cycles # 0.64
304
+ @code_native debuginfo= :none estimate_cost_twoarg (SIMDPirates. evmul, 1 << 9 , 10 ^ 3 , Float64, Val (8 )) # cycles # 0.64
305
+
306
+
# Multiply-add through the three-argument harness.
307
+ estimate_cost_threearg (SIMDPirates. vmuladd, 1 << 9 , 10 ^ 3 , Float64, Val (1 )) # cycles # 3.3
308
+ estimate_cost_threearg (SIMDPirates. vmuladd, 1 << 9 , 10 ^ 3 , Float64, Val (2 )) # cycles # 0.99
309
+ estimate_cost_threearg (SIMDPirates. vmuladd, 1 << 9 , 10 ^ 3 , Float64, Val (4 )) # cycles # 0.54
310
+ estimate_cost_threearg (SIMDPirates. vmuladd, 1 << 9 , 10 ^ 3 , Float64, Val (8 )) # cycles # 0.533
311
+ @code_native debuginfo= :none estimate_cost_threearg (SIMDPirates. vmuladd, 1 << 9 , 10 ^ 3 , Float64, Val (8 )) # cycles # 0.537
312
+ @code_native debuginfo= :none estimate_cost_threearg (SIMDPirates. vfmadd, 1 << 9 , 10 ^ 3 , Float64, Val (8 )) # cycles # 0.85
49
313
50
314
@testset " LoopVectorization.jl" begin
51
315
# Write your own tests here.
0 commit comments