Skip to content

Commit 758a885

Browse files
committed
Make COST keys Instructions again, so that other libraries may extend them. Add special handling of zeros, emitting vzeros.
1 parent a7a247a commit 758a885

File tree

5 files changed

+145
-104
lines changed

5 files changed

+145
-104
lines changed

src/LoopVectorization.jl

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,11 @@ module LoopVectorization
33
using VectorizationBase, SIMDPirates, SLEEFPirates, MacroTools, Parameters
44
using VectorizationBase: REGISTER_SIZE, REGISTER_COUNT, extract_data, num_vector_load_expr,
55
mask, masktable, pick_vector_width_val, valmul, valrem, valmuladd, valadd, valsub, _MM,
6-
maybestaticlength, maybestaticsize, staticm1, subsetview,
6+
maybestaticlength, maybestaticsize, staticm1, subsetview, vzero,
77
Static, StaticUnitRange, StaticLowerUnitRange, StaticUpperUnitRange,
88
PackedStridedPointer, SparseStridedPointer, RowMajorStridedPointer, StaticStridedPointer, StaticStridedStruct
9-
using SIMDPirates: VECTOR_SYMBOLS, evadd, evmul, vrange, reduced_add, reduced_prod, reduce_to_add, reduce_to_prod
9+
using SIMDPirates: VECTOR_SYMBOLS, evadd, evmul, vrange, reduced_add, reduced_prod, reduce_to_add, reduce_to_prod,
10+
vmullog2, vmullog10, vdivlog2, vdivlog2add, vdivlog10, vdivlog10add, vfmaddaddone
1011
using Base.Broadcast: Broadcasted, DefaultArrayStyle
1112
using LinearAlgebra: Adjoint, Transpose
1213
using MacroTools: prewalk, postwalk

src/costs.jl

Lines changed: 73 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,10 @@ function vector_cost(ic::InstructionCost, Wshift, sizeof_T)
6464
end
6565
srt, sl, srp
6666
end
67-
instruction_cost(instruction::Symbol) = get(COST, instruction, OPAQUE_INSTRUCTION)
68-
instruction_cost(instruction::Instruction) = instruction_cost(instruction.instr)
67+
# instruction_cost(instruction::Symbol) = get(COST, instruction, OPAQUE_INSTRUCTION)
68+
# instruction_cost(instruction::Instruction) = instruction_cost(instruction.instr)
69+
instruction_cost(instruction::Instruction) = get(COST, instruction, OPAQUE_INSTRUCTION)
70+
instruction_cost(instruction::Symbol) = instruction_cost(Instruction(instruction))
6971
scalar_cost(instr::Instruction) = scalar_cost(instruction_cost(instr))
7072
vector_cost(instr::Instruction, Wshift, sizeof_T) = vector_cost(instruction_cost(instr), Wshift, sizeof_T)
7173
function cost(instruction::InstructionCost, Wshift, sizeof_T)
@@ -87,76 +89,78 @@ const OPAQUE_INSTRUCTION = InstructionCost(50, 50.0, -1.0, VectorizationBase.REG
8789
# as a heuristic means of approximating register pressure, since many loads can be
8890
# consolidated into a single register. The number of LICM-ed setindex!, on the other
8991
# hand, should indicate how many registers we're keeping live for the sake of eventually storing.
90-
const COST = Dict{Symbol,InstructionCost}(
91-
:getindex => InstructionCost(-3.0,0.5,3,1),
92-
:setindex! => InstructionCost(-3.0,1.0,3,0),
93-
:conditionalstore! => InstructionCost(-3.0,1.0,3,0),
94-
:zero => InstructionCost(1,0.5),
95-
:one => InstructionCost(3,0.5),
96-
:(+) => InstructionCost(4,0.5),
97-
:(-) => InstructionCost(4,0.5),
98-
:(*) => InstructionCost(4,0.5),
99-
:(/) => InstructionCost(13,4.0,-2.0),
100-
:vadd => InstructionCost(4,0.5),
101-
:vsub => InstructionCost(4,0.5),
102-
:vmul => InstructionCost(4,0.5),
103-
:vfdiv => InstructionCost(13,4.0,-2.0),
104-
:evadd => InstructionCost(4,0.5),
105-
:evsub => InstructionCost(4,0.5),
106-
:evmul => InstructionCost(4,0.5),
107-
:evfdiv => InstructionCost(13,4.0,-2.0),
108-
:reduced_add => InstructionCost(4,0.5),# ignoring reduction part of cost, might be nop
109-
:reduced_prod => InstructionCost(4,0.5),# ignoring reduction part of cost, might be nop
110-
:reduce_to_add => InstructionCost(0,0.0,0.0,0),
111-
:reduce_to_prod => InstructionCost(0,0.0,0.0,0),
112-
:abs2 => InstructionCost(4,0.5),
113-
:vabs2 => InstructionCost(4,0.5),
114-
:(==) => InstructionCost(1, 0.5),
115-
:isequal => InstructionCost(1, 0.5),
116-
:(~) => InstructionCost(1, 0.5),
117-
:(&) => InstructionCost(1, 0.5),
118-
:(|) => InstructionCost(1, 0.5),
119-
:(>) => InstructionCost(1, 0.5),
120-
:(<) => InstructionCost(1, 0.5),
121-
:(>=) => InstructionCost(1, 0.5),
122-
:(<=) => InstructionCost(1, 0.5),
123-
:ifelse => InstructionCost(1, 0.5),
124-
:vifelse => InstructionCost(1, 0.5),
125-
:inv => InstructionCost(13,4.0,-2.0,1),
126-
:vinv => InstructionCost(13,4.0,-2.0,1),
127-
:muladd => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
128-
:fma => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
129-
:vmuladd => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
130-
:vfma => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
131-
:vfmadd => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
132-
:vfmsub => InstructionCost(4,0.5), # - and * will fuse into this, so much of the time they're not twice as expensive
133-
:vfnmadd => InstructionCost(4,0.5), # + and -* will fuse into this, so much of the time they're not twice as expensive
134-
:vfnmsub => InstructionCost(4,0.5), # - and -* will fuse into this, so much of the time they're not twice as expensive
135-
:vfmadd_fast => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
136-
:vfmsub_fast => InstructionCost(4,0.5), # - and * will fuse into this, so much of the time they're not twice as expensive
137-
:vfnmadd_fast => InstructionCost(4,0.5), # + and -* will fuse into this, so much of the time they're not twice as expensive
138-
:vfnmsub_fast => InstructionCost(4,0.5), # - and -* will fuse into this, so much of the time they're not twice as expensive
139-
:sqrt => InstructionCost(15,4.0,-2.0),
140-
:sqrt_fast => InstructionCost(15,4.0,-2.0),
141-
:log => InstructionCost(20,20.0,40.0,20),
142-
:exp => InstructionCost(20,20.0,20.0,18),
143-
:^ => InstructionCost(40,40.0,40.0,26), # FIXME
144-
:sin => InstructionCost(18,15.0,68.0,23),
145-
:cos => InstructionCost(18,15.0,68.0,26),
146-
:sincos => InstructionCost(25,22.0,70.0,26),
147-
:log_fast => InstructionCost(20,20.0,40.0,20),
148-
:exp_fast => InstructionCost(20,20.0,20.0,18),
149-
:sin_fast => InstructionCost(18,15.0,68.0,23),
150-
:cos_fast => InstructionCost(18,15.0,68.0,26),
151-
:sincos_fast => InstructionCost(25,22.0,70.0,26),
152-
:identity => InstructionCost(0,0.0,0.0,0),
153-
:adjoint => InstructionCost(0,0.0,0.0,0),
154-
:transpose => InstructionCost(0,0.0,0.0,0),
92+
const COST = Dict{Instruction,InstructionCost}(
93+
Instruction(:getindex) => InstructionCost(-3.0,0.5,3,1),
94+
Instruction(:setindex!) => InstructionCost(-3.0,1.0,3,0),
95+
Instruction(:conditionalstore!) => InstructionCost(-3.0,1.0,3,0),
96+
Instruction(:zero) => InstructionCost(1,0.5),
97+
Instruction(:one) => InstructionCost(3,0.5),
98+
Instruction(:(+)) => InstructionCost(4,0.5),
99+
Instruction(:(-)) => InstructionCost(4,0.5),
100+
Instruction(:(*)) => InstructionCost(4,0.5),
101+
Instruction(:(/)) => InstructionCost(13,4.0,-2.0),
102+
Instruction(:vadd) => InstructionCost(4,0.5),
103+
Instruction(:vsub) => InstructionCost(4,0.5),
104+
Instruction(:vmul) => InstructionCost(4,0.5),
105+
Instruction(:vfdiv) => InstructionCost(13,4.0,-2.0),
106+
Instruction(:evadd) => InstructionCost(4,0.5),
107+
Instruction(:evsub) => InstructionCost(4,0.5),
108+
Instruction(:evmul) => InstructionCost(4,0.5),
109+
Instruction(:evfdiv) => InstructionCost(13,4.0,-2.0),
110+
Instruction(:reduced_add) => InstructionCost(4,0.5),# ignoring reduction part of cost, might be nop
111+
Instruction(:reduced_prod) => InstructionCost(4,0.5),# ignoring reduction part of cost, might be nop
112+
Instruction(:reduce_to_add) => InstructionCost(0,0.0,0.0,0),
113+
Instruction(:reduce_to_prod) => InstructionCost(0,0.0,0.0,0),
114+
Instruction(:abs2) => InstructionCost(4,0.5),
115+
Instruction(:vabs2) => InstructionCost(4,0.5),
116+
Instruction(:(==)) => InstructionCost(1, 0.5),
117+
Instruction(:isequal) => InstructionCost(1, 0.5),
118+
Instruction(:(~)) => InstructionCost(1, 0.5),
119+
Instruction(:(&)) => InstructionCost(1, 0.5),
120+
Instruction(:(|)) => InstructionCost(1, 0.5),
121+
Instruction(:(>)) => InstructionCost(1, 0.5),
122+
Instruction(:(<)) => InstructionCost(1, 0.5),
123+
Instruction(:(>=)) => InstructionCost(1, 0.5),
124+
Instruction(:(<=)) => InstructionCost(1, 0.5),
125+
Instruction(:ifelse) => InstructionCost(1, 0.5),
126+
Instruction(:vifelse) => InstructionCost(1, 0.5),
127+
Instruction(:inv) => InstructionCost(13,4.0,-2.0,1),
128+
Instruction(:vinv) => InstructionCost(13,4.0,-2.0,1),
129+
Instruction(:muladd) => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
130+
Instruction(:fma) => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
131+
Instruction(:vmuladd) => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
132+
Instruction(:vfma) => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
133+
Instruction(:vfmadd) => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
134+
Instruction(:vfmsub) => InstructionCost(4,0.5), # - and * will fuse into this, so much of the time they're not twice as expensive
135+
Instruction(:vfnmadd) => InstructionCost(4,0.5), # + and -* will fuse into this, so much of the time they're not twice as expensive
136+
Instruction(:vfnmsub) => InstructionCost(4,0.5), # - and -* will fuse into this, so much of the time they're not twice as expensive
137+
Instruction(:vfmadd_fast) => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
138+
Instruction(:vfmsub_fast) => InstructionCost(4,0.5), # - and * will fuse into this, so much of the time they're not twice as expensive
139+
Instruction(:vfnmadd_fast) => InstructionCost(4,0.5), # + and -* will fuse into this, so much of the time they're not twice as expensive
140+
Instruction(:vfnmsub_fast) => InstructionCost(4,0.5), # - and -* will fuse into this, so much of the time they're not twice as expensive
141+
Instruction(:sqrt) => InstructionCost(15,4.0,-2.0),
142+
Instruction(:sqrt_fast) => InstructionCost(15,4.0,-2.0),
143+
Instruction(:log) => InstructionCost(20,20.0,40.0,20),
144+
Instruction(:exp) => InstructionCost(20,20.0,20.0,18),
145+
Instruction(:(^)) => InstructionCost(40,40.0,40.0,26), # FIXME
146+
Instruction(:sin) => InstructionCost(18,15.0,68.0,23),
147+
Instruction(:cos) => InstructionCost(18,15.0,68.0,26),
148+
Instruction(:sincos) => InstructionCost(25,22.0,70.0,26),
149+
Instruction(:log_fast) => InstructionCost(20,20.0,40.0,20),
150+
Instruction(:exp_fast) => InstructionCost(20,20.0,20.0,18),
151+
Instruction(:sin_fast) => InstructionCost(18,15.0,68.0,23),
152+
Instruction(:cos_fast) => InstructionCost(18,15.0,68.0,26),
153+
Instruction(:sincos_fast) => InstructionCost(25,22.0,70.0,26),
154+
Instruction(:identity) => InstructionCost(0,0.0,0.0,0),
155+
Instruction(:adjoint) => InstructionCost(0,0.0,0.0,0),
156+
Instruction(:transpose) => InstructionCost(0,0.0,0.0,0),
155157
# Symbol("##CONSTANT##") => InstructionCost(0,0.0)
156158
)
157159

158-
const KNOWNINSTRUCTIONS = keys(COST)
159-
instruction(f, m) = f ∈ KNOWNINSTRUCTIONS ? Instruction(:LoopVectorization, f) : Instruction(m, f)
160+
# const KNOWNINSTRUCTIONS = keys(COST)
161+
# instruction(f, m) = f ∈ KNOWNINSTRUCTIONS ? Instruction(:LoopVectorization, f) : Instruction(m, f)
162+
instruction(f::Symbol, m) = Instruction(f) ∈ keys(COST) ? Instruction(f) : Instruction(m, f)
163+
# instruction(f, m) = get(COST, f, Instruction(m, f))
160164

161165
# for (k, v) ∈ COST # so we can look up Symbol(typeof(function))
162166
# COST[Symbol("typeof(", lower(k), ")")] = v

src/lower_constant.jl

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,30 @@
1-
1+
function lower_zero!(
2+
q::Expr, op::Operation, vectorized::Symbol, W::Symbol, unrolled::Symbol, U::Int,
3+
suffix::Union{Nothing,Int}, typeT::Symbol
4+
)
5+
mvar = variable_name(op, suffix)
6+
if vectorized ∈ loopdependencies(op) || vectorized ∈ reducedchildren(op) || vectorized ∈ reduceddependencies(op)
7+
call = Expr(:call, lv(:vzero), W, typeT)
8+
else
9+
call = Expr(:call, :zero, typeT)
10+
end
11+
if unrolled ∈ loopdependencies(op) || unrolled ∈ reducedchildren(op) || unrolled ∈ reduceddependencies(op)
12+
for u ∈ 0:U-1
13+
push!(q.args, Expr(:(=), Symbol(mvar, u), call))
14+
end
15+
else
16+
push!(q.args, Expr(:(=), mvar, call))
17+
end
18+
nothing
19+
end
220
function lower_constant!(
321
q::Expr, op::Operation, vectorized::Symbol, W::Symbol, unrolled::Symbol, U::Int,
4-
suffix::Union{Nothing,Int}, mask::Any = nothing
22+
suffix::Union{Nothing,Int}
523
)
624
instruction = op.instruction
725
mvar = variable_name(op, suffix)
826
constsym = instruction.instr
9-
# constsym = mangledvar(op)
1027
if vectorized ∈ loopdependencies(op) || vectorized ∈ reducedchildren(op) || vectorized ∈ reduceddependencies(op)
11-
# call = Expr(:call, lv(:vbroadcast), W, mangledvar(op))
1228
call = Expr(:call, lv(:vbroadcast), W, constsym)
1329
if unrolled ∈ loopdependencies(op) || unrolled ∈ reducedchildren(op) || unrolled ∈ reduceddependencies(op)
1430
for u ∈ 0:U-1
@@ -29,6 +45,21 @@ function lower_constant!(
2945
nothing
3046
end
3147

48+
function setop!(ls, op, val)
49+
if instruction(op) === LOOPCONSTANT# && mangledvar(op) !== val
50+
pushpreamble!(ls, Expr(:(=), mangledvar(op), val))
51+
else
52+
pushpreamble!(ls, Expr(:(=), instruction(op).instr, val))
53+
end
54+
nothing
55+
end
56+
function setconstantop!(ls, op, val)
57+
if instruction(op) === LOOPCONSTANT# && mangledvar(op) !== val
58+
pushpreamble!(ls, Expr(:(=), mangledvar(op), val))
59+
end
60+
nothing
61+
end
62+
3263

3364
function lower_licm_constants!(ls::LoopSet)
3465
ops = operations(ls)
@@ -42,7 +73,7 @@ function lower_licm_constants!(ls::LoopSet)
4273
setop!(ls, ops[id], Expr(:call, lv(:sizeequivalentfloat), ls.T, intval))
4374
end
4475
for id ∈ ls.preamble_zeros
45-
setop!(ls, ops[id], Expr(:call, :zero, ls.T))
76+
setconstantop!(ls, ops[id], Expr(:call, :zero, ls.T))
4677
end
4778
for id ∈ ls.preamble_ones
4879
setop!(ls, ops[id], Expr(:call, :one, ls.T))

src/lowering.jl

Lines changed: 25 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,16 @@
88
# end
99

1010
function lower!(
11-
q::Expr, op::Operation, vectorized::Symbol, W::Symbol, unrolled::Symbol, tiled::Symbol, U::Int,
12-
suffix::Union{Nothing,Int}, mask::Union{Nothing,Symbol,Unsigned} = nothing
11+
q::Expr, op::Operation, vectorized::Symbol, ls::LoopSet, unrolled::Symbol, tiled::Symbol, U::Int,
12+
suffix::Union{Nothing,Int}, mask::Union{Nothing,Symbol,Unsigned}
1313
)
14+
W = ls.W
1415
if isconstant(op)
15-
lower_constant!(q, op, vectorized, W, unrolled, U, suffix, mask)
16+
if identifier(op) ∈ ls.preamble_zeros
17+
lower_zero!(q, op, vectorized, W, unrolled, U, suffix, ls.T)
18+
else
19+
lower_constant!(q, op, vectorized, W, unrolled, U, suffix)
20+
end
1621
elseif isload(op)
1722
lower_load!(q, op, vectorized, W, unrolled, tiled, U, suffix, mask)
1823
elseif iscompute(op)
@@ -22,10 +27,10 @@ function lower!(
2227
end
2328
end
2429
function lower!(
25-
q::Expr, ops::AbstractVector{Operation}, vectorized::Symbol, W::Symbol, unrolled::Symbol, tiled::Symbol, U::Int,
26-
suffix::Union{Nothing,Int}, mask::Union{Nothing,Symbol,Unsigned} = nothing
30+
q::Expr, ops::AbstractVector{Operation}, vectorized::Symbol, ls::LoopSet, unrolled::Symbol, tiled::Symbol, U::Int,
31+
suffix::Union{Nothing,Int}, mask::Union{Nothing,Symbol,Unsigned}
2732
)
28-
foreach(op -> lower!(q, op, vectorized, W, unrolled, tiled, U, suffix, mask), ops)
33+
foreach(op -> lower!(q, op, vectorized, ls, unrolled, tiled, U, suffix, mask), ops)
2934
end
3035

3136
tiledsym(s::Symbol) = Symbol("##outer##", s, "##outer##")
@@ -70,9 +75,9 @@ function lower_nest(
7075
end
7176
for prepost ∈ 1:2
7277
# !U && !T
73-
lower!(blockq, ops[1,1,prepost,n], vectorized, W, unrolled, last(order), U, nothing, mask)
78+
lower!(blockq, ops[1,1,prepost,n], vectorized, ls, unrolled, last(order), U, nothing, mask)
7479
# for u ∈ 0:U-1 # U && !T
75-
lower!(blockq, ops[2,1,prepost,n], vectorized, W, unrolled, last(order), U, nothing, mask)
80+
lower!(blockq, ops[2,1,prepost,n], vectorized, ls, unrolled, last(order), U, nothing, mask)
7681
# end
7782
if length(ops[1,2,prepost,n]) + length(ops[2,2,prepost,n]) > 0
7883
for t ∈ 0:T-1
@@ -82,9 +87,9 @@ function lower_nest(
8287
push!(blockq.args, Expr(:+=, last(order), 1))
8388
end
8489
# !U && T
85-
lower!(blockq, ops[1,2,prepost,n], vectorized, W, unrolled, last(order), U, t, mask)
90+
lower!(blockq, ops[1,2,prepost,n], vectorized, ls, unrolled, last(order), U, t, mask)
8691
# for u ∈ 0:U-1 # U && T
87-
lower!(blockq, ops[2,2,prepost,n], vectorized, W, unrolled, last(order), U, t, mask)
92+
lower!(blockq, ops[2,2,prepost,n], vectorized, ls, unrolled, last(order), U, t, mask)
8893
# end
8994
end
9095
end
@@ -146,9 +151,16 @@ end
146151
function initialize_outer_reductions!(
147152
q::Expr, op::Operation, Umin::Int, Umax::Int, W::Symbol, typeT::Symbol, vectorized::Symbol, suffix::Union{Symbol,Nothing} = nothing
148153
)
149-
z = Expr(:call, reduction_zero(op.instruction), typeT)
150-
if vectorized ∈ reduceddependencies(op)
151-
z = Expr(:call, lv(:vbroadcast), W, z)
154+
reduct_zero = reduction_zero(op.instruction)
155+
isvectorized = vectorized ∈ reduceddependencies(op)
156+
z = if isvectorized
157+
if reduct_zero === :zero
158+
Expr(:call, lv(:vzero), W, typeT)
159+
else
160+
Expr(:call, lv(:vbroadcast), W, Expr(:call, reduct_zero, typeT))
161+
end
162+
else
163+
Expr(:call, reduct_zero, typeT)
152164
end
153165
mvar = variable_name(op, suffix)
154166
for u ∈ Umin:Umax-1
@@ -362,20 +374,6 @@ end
362374
@inline sizeequivalentint(::Type{Float16}, x::Int64) = Int16(x)
363375
@inline sizeequivalentint(::Type{Float16}, x::Int32) = Int16(x)
364376

365-
function setop!(ls, op, val)
366-
if instruction(op) === LOOPCONSTANT# && mangledvar(op) !== val
367-
pushpreamble!(ls, Expr(:(=), mangledvar(op), val))
368-
else
369-
pushpreamble!(ls, Expr(:(=), instruction(op).instr, val))
370-
end
371-
nothing
372-
end
373-
function setconstantop!(ls, op, val)
374-
if instruction(op) === LOOPCONSTANT# && mangledvar(op) !== val
375-
pushpreamble!(ls, Expr(:(=), mangledvar(op), val))
376-
end
377-
nothing
378-
end
379377

380378
function setup_preamble!(ls::LoopSet, W::Symbol, typeT::Symbol, vectorized::Symbol, unrolled::Symbol, tiled::Symbol, U::Int)
381379
# println("Setup preamble")

test/runtests.jl

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
using Test
22
using LoopVectorization
33
using LinearAlgebra
4-
T = Float32
4+
# T = Float32
55

66

77
function clenshaw(x,coeff)
@@ -1188,6 +1188,11 @@ end
11881188
ret[j] = clenshaw(x[j], coeff)
11891189
end
11901190
end
1191+
function clenshawavx!(ret,x,coeff)
1192+
@avx for j in 1:length(ret)
1193+
ret[j] = clenshaw(x[j], coeff)
1194+
end
1195+
end
11911196

11921197
function softmax3_core!(lse, qq, xx, tmpmax, maxk, nk)
11931198
for k in Base.OneTo(maxk)
@@ -1443,6 +1448,8 @@ end
14431448
clenshaw!(y1,x,c)
14441449
clenshaw_avx!(y2,x,c)
14451450
@test y1 ≈ y2
1451+
clenshawavx!(y2,x,c)
1452+
@test y1 ≈ y2
14461453

14471454

14481455
ni, nj, nk = (100, 100, 10)

0 commit comments

Comments
 (0)