@@ -224,6 +224,100 @@ end
224
224
(ts_end - ts_start) / (2 N* K), vsum (s_1)
225
225
end
226
226
end
227
+
228
+
229
+
230
"""
    estimate_cost_onearg_llvmunroll(f, N = 512, K = 1_000, T = Float64, Val(U) = Val(4))

Time repeated application of the one-argument function `f` to vectors loaded from a
random buffer, accumulating the results with `vadd`.

Each load is a single `Vec` whose width is the width chosen by
`VectorizationBase.pick_vector_width_shift(T)` multiplied by `U`.

# Arguments
- `f`: function called as `f(v)` on each loaded vector.
- `N`: sizes the buffer (`N << Wshift` elements); the inner loop runs `N >> log2(U)` times.
- `K`: number of passes over the buffer.
- `T`: element type of the buffer and vectors.
- `Val(U)`: width multiplier; must be a positive power of two.

# Returns
A tuple `(cost, checksum)`: the difference between the two `cpucycle_id` timestamps
divided by `N * K`, and `vsum` of the accumulator.

# Throws
- `ArgumentError` if `U` is not a positive power of two.
- `AssertionError` if the two ids returned by `cpucycle_id` differ
  (presumably the core id -- confirm against `cpucycle_id`).
"""
@generated function estimate_cost_onearg_llvmunroll(
    f::F, N::Int = 512, K = 1_000, ::Type{T} = Float64, ::Val{U} = Val(4)
) where {F,T,U}
    # The trip count is derived from `intlog2(U)` while the pointer stride uses `U`
    # itself; they only agree for powers of two, otherwise loads run past the buffer.
    if !(U isa Integer && U > 0 && ispow2(U))
        msg = "U must be a positive power of two, got Val($(repr(U)))"
        return :(throw(ArgumentError($msg)))
    end
    W, Wshift = VectorizationBase.pick_vector_width_shift(T)
    Ushift = VectorizationBase.intlog2(U)
    W <<= Ushift
    quote
        s = vbroadcast(Vec{$W,$T}, zero(T))
        x = rand(T, N << $Wshift)
        # `x` is only reached through a raw pointer below, so root it explicitly.
        GC.@preserve x begin
            ptrx = pointer(x)
            ts_start, id_start = cpucycle_id()
            for k ∈ 1:K
                _ptrx = ptrx  # restart at the beginning of the buffer on every pass
                for n ∈ 1:(N >> $Ushift)
                    v = vload(Vec{$W,$T}, _ptrx)
                    s = vadd(s, f(v))
                    _ptrx += VectorizationBase.REGISTER_SIZE * $U
                end
            end
            ts_end, id_end = cpucycle_id()
        end
        @assert id_start == id_end
        (ts_end - ts_start) / (N * K), vsum(s)
    end
end
252
"""
    estimate_cost_onearg_tworet_llvmunroll(f, N = 512, K = 1_000, T = Float64, Val(U) = Val(4))

Time repeated application of a one-argument function `f` that returns two values,
folding both results into an accumulator with `vmuladd(a, b, s)`.

Each load is a single `Vec` whose width is the width chosen by
`VectorizationBase.pick_vector_width_shift(T)` multiplied by `U`.

# Arguments
- `f`: function called as `a, b = f(v)` on each loaded vector.
- `N`: sizes the buffer (`N << Wshift` elements); the inner loop runs `N >> log2(U)` times.
- `K`: number of passes over the buffer.
- `T`: element type of the buffer and vectors.
- `Val(U)`: width multiplier; must be a positive power of two.

# Returns
A tuple `(cost, checksum)`: the difference between the two `cpucycle_id` timestamps
divided by `N * K`, and `vsum` of the accumulator.

# Throws
- `ArgumentError` if `U` is not a positive power of two.
- `AssertionError` if the two ids returned by `cpucycle_id` differ
  (presumably the core id -- confirm against `cpucycle_id`).
"""
@generated function estimate_cost_onearg_tworet_llvmunroll(
    f::F, N::Int = 512, K = 1_000, ::Type{T} = Float64, ::Val{U} = Val(4)
) where {F,T,U}
    # The trip count is derived from `intlog2(U)` while the pointer stride uses `U`
    # itself; they only agree for powers of two, otherwise loads run past the buffer.
    if !(U isa Integer && U > 0 && ispow2(U))
        msg = "U must be a positive power of two, got Val($(repr(U)))"
        return :(throw(ArgumentError($msg)))
    end
    W, Wshift = VectorizationBase.pick_vector_width_shift(T)
    Ushift = VectorizationBase.intlog2(U)
    W <<= Ushift
    quote
        s = vbroadcast(Vec{$W,$T}, zero(T))
        x = rand(T, N << $Wshift)
        # `x` is only reached through a raw pointer below, so root it explicitly.
        GC.@preserve x begin
            ptrx = pointer(x)
            ts_start, id_start = cpucycle_id()
            for k ∈ 1:K
                _ptrx = ptrx  # restart at the beginning of the buffer on every pass
                for n ∈ 1:(N >> $Ushift)
                    v = vload(Vec{$W,$T}, _ptrx)
                    a, b = f(v)
                    s = vmuladd(a, b, s)
                    _ptrx += VectorizationBase.REGISTER_SIZE * $U
                end
            end
            ts_end, id_end = cpucycle_id()
        end
        @assert id_start == id_end
        (ts_end - ts_start) / (N * K), vsum(s)
    end
end
275
"""
    estimate_cost_twoarg_llvmunroll(f, N = 512, K = 1_000, T = Float64, Val(U) = Val(4))

Time repeated application of the two-argument function `f`, called as `s = f(v, s)`
so that each call consumes the previous result. The accumulator starts at `one(T)`.

Each load is a single `Vec` whose width is the width chosen by
`VectorizationBase.pick_vector_width_shift(T)` multiplied by `U`.

# Arguments
- `f`: function called as `f(v, s)` with the loaded vector and the running accumulator.
- `N`: sizes the buffer (`N << Wshift` elements); the inner loop runs `N >> log2(U)` times.
- `K`: number of passes over the buffer.
- `T`: element type of the buffer and vectors.
- `Val(U)`: width multiplier; must be a positive power of two.

# Returns
A tuple `(cost, checksum)`: the difference between the two `cpucycle_id` timestamps
divided by `N * K`, and `vsum` of the accumulator.

# Throws
- `ArgumentError` if `U` is not a positive power of two.
- `AssertionError` if the two ids returned by `cpucycle_id` differ
  (presumably the core id -- confirm against `cpucycle_id`).
"""
@generated function estimate_cost_twoarg_llvmunroll(
    f::F, N::Int = 512, K = 1_000, ::Type{T} = Float64, ::Val{U} = Val(4)
) where {F,T,U}
    # The trip count is derived from `intlog2(U)` while the pointer stride uses `U`
    # itself; they only agree for powers of two, otherwise loads run past the buffer.
    if !(U isa Integer && U > 0 && ispow2(U))
        msg = "U must be a positive power of two, got Val($(repr(U)))"
        return :(throw(ArgumentError($msg)))
    end
    W, Wshift = VectorizationBase.pick_vector_width_shift(T)
    Ushift = VectorizationBase.intlog2(U)
    W <<= Ushift
    quote
        s = vbroadcast(Vec{$W,$T}, one(T))
        x = rand(T, N << $Wshift)
        # `x` is only reached through a raw pointer below, so root it explicitly.
        GC.@preserve x begin
            ptrx = pointer(x)
            ts_start, id_start = cpucycle_id()
            for k ∈ 1:K
                _ptrx = ptrx  # restart at the beginning of the buffer on every pass
                for n ∈ 1:(N >> $Ushift)
                    v = vload(Vec{$W,$T}, _ptrx)
                    s = f(v, s)
                    _ptrx += VectorizationBase.REGISTER_SIZE * $U
                end
            end
            ts_end, id_end = cpucycle_id()
        end
        @assert id_start == id_end
        (ts_end - ts_start) / (N * K), vsum(s)
    end
end
297
"""
    estimate_cost_threearg_llvmunroll(f, N = 512, K = 1_000, T = Float64, Val(U) = Val(4))

Time repeated application of the three-argument function `f`, called as
`s = f(v, v, s)` so that each call consumes the previous result. The accumulator
starts at `zero(T)`.

Each load is a single `Vec` whose width is the width chosen by
`VectorizationBase.pick_vector_width_shift(T)` multiplied by `U`.

# Arguments
- `f`: function called as `f(v, v, s)` with the loaded vector (twice) and the accumulator.
- `N`: sizes the buffer (`N << Wshift` elements); the inner loop runs `N >> log2(U)` times.
- `K`: number of passes over the buffer.
- `T`: element type of the buffer and vectors.
- `Val(U)`: width multiplier; must be a positive power of two.

# Returns
A tuple `(cost, checksum)`: the difference between the two `cpucycle_id` timestamps
divided by `N * K`, and `vsum` of the accumulator.

# Throws
- `ArgumentError` if `U` is not a positive power of two.
- `AssertionError` if the two ids returned by `cpucycle_id` differ
  (presumably the core id -- confirm against `cpucycle_id`).
"""
@generated function estimate_cost_threearg_llvmunroll(
    f::F, N::Int = 512, K = 1_000, ::Type{T} = Float64, ::Val{U} = Val(4)
) where {F,T,U}
    # The trip count is derived from `intlog2(U)` while the pointer stride uses `U`
    # itself; they only agree for powers of two, otherwise loads run past the buffer.
    if !(U isa Integer && U > 0 && ispow2(U))
        msg = "U must be a positive power of two, got Val($(repr(U)))"
        return :(throw(ArgumentError($msg)))
    end
    W, Wshift = VectorizationBase.pick_vector_width_shift(T)
    Ushift = VectorizationBase.intlog2(U)
    W <<= Ushift
    quote
        s = vbroadcast(Vec{$W,$T}, zero(T))
        x = rand(T, N << $Wshift)
        # `x` is only reached through a raw pointer below, so root it explicitly.
        GC.@preserve x begin
            ptrx = pointer(x)
            ts_start, id_start = cpucycle_id()
            for k ∈ 1:K
                _ptrx = ptrx  # restart at the beginning of the buffer on every pass
                for n ∈ 1:(N >> $Ushift)
                    v = vload(Vec{$W,$T}, _ptrx)
                    s = f(v, v, s)
                    _ptrx += VectorizationBase.REGISTER_SIZE * $U
                end
            end
            ts_end, id_end = cpucycle_id()
        end
        @assert id_start == id_end
        (ts_end - ts_start) / (N * K), vsum(s)
    end
end
319
+
320
+
227
321
# Benchmark runs of `exp` through `estimate_cost_onearg_serial` for Val(1), Val(2) and
# Val(4). The trailing number on each line is a previously recorded result.
# NOTE(review): units are not stated on these lines -- presumably cycles; confirm against
# the definition of `estimate_cost_onearg_serial`.
estimate_cost_onearg_serial (exp, 512 , 1_000 , Float64, Val (1 )) # 21
228
322
estimate_cost_onearg_serial (exp, 512 , 1_000 , Float64, Val (2 )) # 18.4
229
323
estimate_cost_onearg_serial (exp, 512 , 1_000 , Float64, Val (4 )) # 17.5
@@ -254,10 +348,15 @@ estimate_cost_onearg_tworet_serial(sincos, 512, 1_000, Float64, Val(2)) # 23
254
348
# Benchmark run of `sincos` through `estimate_cost_onearg_tworet_serial` with Val(4);
# the trailing number is a previously recorded result (units presumably cycles -- confirm).
estimate_cost_onearg_tworet_serial (sincos, 512 , 1_000 , Float64, Val (4 )) # 22
255
349
256
350
257
# Benchmark runs of `SLEEFPirates.exp`: first through `estimate_cost_onearg`, then
# through `estimate_cost_onearg_llvmunroll`, each for Val(1), Val(2) and Val(4).
# Every line records two numbers in its trailing comments.
# NOTE(review): what the two columns stand for (e.g. different machines or revisions)
# is not stated here -- confirm before relying on them.
- estimate_cost_onearg (SLEEFPirates. exp, 512 , 1_000 , Float64, Val (1 )) # 28 # 21
258
- estimate_cost_onearg (SLEEFPirates. exp, 512 , 1_000 , Float64, Val (2 )) # 28 # 20
351
+ estimate_cost_onearg (SLEEFPirates. exp, 512 , 1_000 , Float64, Val (1 )) # 48 # 21
352
+ estimate_cost_onearg (SLEEFPirates. exp, 512 , 1_000 , Float64, Val (2 )) # 52 # 20
259
353
estimate_cost_onearg (SLEEFPirates. exp, 512 , 1_000 , Float64, Val (4 )) # 28 # 19.5
260
354
355
+ estimate_cost_onearg_llvmunroll (SLEEFPirates. exp, 512 , 1_000 , Float64, Val (1 )) # 50 # 21
356
+ estimate_cost_onearg_llvmunroll (SLEEFPirates. exp, 512 , 1_000 , Float64, Val (2 )) # 40 # 20
357
# The Val(3) run below is disabled.
# NOTE(review): `estimate_cost_onearg_llvmunroll` takes its loop count from `intlog2(U)`
# but its pointer stride from `U`, which only agree when `U` is a power of two -- likely
# the reason this call is commented out; confirm.
+ # estimate_cost_onearg_llvmunroll(SLEEFPirates.exp, 512, 1_000, Float64, Val(3)) # 40 #
358
+ estimate_cost_onearg_llvmunroll (SLEEFPirates. exp, 512 , 1_000 , Float64, Val (4 )) # 32 # 19.5
359
+
261
360
# Benchmark runs of `SLEEFPirates.log` through `estimate_cost_onearg` for Val(1), Val(2)
# and Val(4). The trailing comments record a cycle count followed by a second number.
# NOTE(review): the meaning of the second number is not stated here -- confirm.
estimate_cost_onearg (SLEEFPirates. log, 512 , 1_000 , Float64, Val (1 )) # 51 cycles # 44
262
361
estimate_cost_onearg (SLEEFPirates. log, 512 , 1_000 , Float64, Val (2 )) # 51 cycles # 40
263
362
estimate_cost_onearg (SLEEFPirates. log, 512 , 1_000 , Float64, Val (4 )) # 51 cycles # 39
0 commit comments