Skip to content

Commit 330d0b6

Browse files
committed
Fix poor codegen in handling reductions in presence of reduction zeros (poor codegen LLVM was forced to clean up).
1 parent 7bf98b4 commit 330d0b6

File tree

2 files changed

+95
-4
lines changed

2 files changed

+95
-4
lines changed

src/add_compute.jl

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ function add_parent!(
5757
add_operation!(ls, gensym(:temporary), var, elementbytes, position)
5858
else # assumed constant
5959
add_constant!(ls, var, elementbytes)
60+
# add_constant!(ls, var, deps, gensym(:loopredefconst), elementbytes)
6061
end
6162
pushparent!(parents, deps, reduceddeps, parent)
6263
end
@@ -90,6 +91,16 @@ function add_compute!(ls::LoopSet, op::Operation)
9091
@assert iscompute(op)
9192
pushop!(ls, child, name(op))
9293
end
94+
"""
    isreductzero(op::Operation, ls::LoopSet, reduct_zero::Symbol)

Return `true` when the constant operation `op` can be treated as the reduction identity
named by `reduct_zero`, and `false` otherwise.

`op` qualifies when it is constant and either

- `reduct_zero` is identical to `op.instruction.mod`, or
- `reduct_zero === :zero` and `identifier(op)` is listed in `ls.preamble_zeros`, or
- `reduct_zero === :one` and `identifier(op)` is listed in `ls.preamble_ones`.
"""
function isreductzero(op::Operation, ls::LoopSet, reduct_zero::Symbol)
    # Only constant operations can serve as a reduction's initial value.
    isconstant(op) || return false
    # Direct match against the symbol stored on the op's instruction.
    reduct_zero === op.instruction.mod && return true
    # Otherwise consult the zeros/ones recorded on the LoopSet preamble.
    if reduct_zero === :zero
        identifier(op) ∈ ls.preamble_zeros && return true
    elseif reduct_zero === :one
        identifier(op) ∈ ls.preamble_ones && return true
    end
    return false
end
93104

94105
function add_reduction_update_parent!(
95106
vparents::Vector{Operation}, deps::Vector{Symbol}, reduceddeps::Vector{Symbol}, ls::LoopSet,
@@ -118,7 +129,7 @@ function add_reduction_update_parent!(
118129
end
119130
pushpreamble!(ls, op, name, reductinit)
120131
end
121-
if isconstant(parent) && reduct_zero === parent.instruction.mod #we can use parent op as initialization.
132+
if isreductzero(parent, ls, reduct_zero)
122133
reductcombine = reduction_combine_to(instrclass)
123134
end
124135
else
@@ -128,7 +139,8 @@ function add_reduction_update_parent!(
128139
end
129140
combineddeps = copy(deps); mergesetv!(combineddeps, reduceddeps)
130141
directdependency && pushparent!(vparents, deps, reduceddeps, reductinit)#parent) # deps and reduced deps will not be disjoint
131-
update_reduction_status!(vparents, combineddeps, name(reductinit))
142+
# update_reduction_status!(vparents, combineddeps, name(reductinit))
143+
update_reduction_status!(vparents, reduceddeps, name(reductinit))
132144
# this is the op added by add_compute
133145
op = Operation(length(operations(ls)), reductsym, elementbytes, instr, compute, deps, reduceddeps, vparents)
134146
parent.instruction === LOOPCONSTANT && push!(ls.outer_reductions, identifier(op))

test/runtests.jl

Lines changed: 81 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,17 @@ end
1818

1919

2020
@time @testset "LoopVectorization.jl" begin
21-
22-
21+
selfdotq = :(for i ∈ eachindex(a)
22+
s += a[i]*a[i]
23+
end)
24+
lsselfdot = LoopVectorization.LoopSet(selfdotq);
25+
io = IOBuffer();
26+
println(io, LoopVectorization.operations(lsselfdot))
27+
s = String(take!(io))
28+
@test occursin("Operation[var\"", s)
29+
@test occursin("s = 0", s)
30+
@test occursin("s = LoopVectorization.vfmadd", s)
31+
2332
@time @testset "dot" begin
2433
dotq = :(for i ∈ eachindex(a,b)
2534
s += a[i]*b[i]
@@ -1268,6 +1277,35 @@ end
12681277
end
12691278
end
12701279

1280+
"""
    AtmulBpos!(C, A, B)

Reference implementation: for every `(m, n)`, accumulate `sum(A[k,m] * B[k,n] for k)` and
store it into `C[m,n]` only when the value already in `C[m,n]` is greater than zero.
Entries of `C` that are not positive on entry are left untouched.

Indexing is 1-based via `size`, and the loops run under `@inbounds`, so `A` must be
`size(A,1) × size(C,1)` and `B` at least `size(A,1) × size(C,2)`; no bounds are checked.
"""
function AtmulBpos!(C, A, B)
    @inbounds for n ∈ 1:size(C,2), m ∈ 1:size(C,1)
        Cₘₙ = zero(eltype(C))
        @simd ivdep for k ∈ 1:size(A,1)
            Cₘₙ += A[k,m] * B[k,n]
        end
        # Conditional store: only overwrite entries that were positive on entry.
        C[m,n] > 0 && (C[m,n] = Cₘₙ)
    end
end
1289+
"""
    AtmulBposavx!(C, A, B)

Same loop nest as `AtmulBpos!`, but with the whole nest wrapped in `@avx`: for every
`(m, n)`, accumulate `A[k,m] * B[k,n]` over `k` and store the result into `C[m,n]` only
when the existing `C[m,n]` is greater than zero.
"""
function AtmulBposavx!(C, A, B)
    @avx for n ∈ 1:size(C,2), m ∈ 1:size(C,1)
        Cₘₙ = zero(eltype(C))
        for k ∈ 1:size(A,1)
            Cₘₙ += A[k,m] * B[k,n]
        end
        # Conditional store: only overwrite entries that were positive on entry.
        C[m,n] > 0 && (C[m,n] = Cₘₙ)
    end
end
1298+
"""
    AtmulBpos_avx!(C, A, B)

Same loop nest as `AtmulBpos!`, but with the whole nest wrapped in `@_avx`: for every
`(m, n)`, accumulate `A[k,m] * B[k,n]` over `k` and store the result into `C[m,n]` only
when the existing `C[m,n]` is greater than zero.
"""
function AtmulBpos_avx!(C, A, B)
    @_avx for n ∈ 1:size(C,2), m ∈ 1:size(C,1)
        Cₘₙ = zero(eltype(C))
        for k ∈ 1:size(A,1)
            Cₘₙ += A[k,m] * B[k,n]
        end
        # Conditional store: only overwrite entries that were positive on entry.
        C[m,n] > 0 && (C[m,n] = Cₘₙ)
    end
end
1307+
1308+
12711309
N = 117
12721310
for T ∈ (Float32, Float64, Int32, Int64)
12731311
if T <: Integer
@@ -1293,6 +1331,23 @@ end
12931331
@test c1 ≈ c2
12941332
fill!(c2, -999999999); maybewriteoravx!(c2, a, b)
12951333
@test c1 ≈ c2
1334+
1335+
M, K, N = 83, 85, 79;
1336+
if T <: Integer
1337+
A = rand(T(-100):T(100), K, M);
1338+
B = rand(T(-100):T(100), K, N);
1339+
C1 = rand(T(-100):T(100), M, N);
1340+
else
1341+
A = randn(T, K, M);
1342+
B = randn(T, K, N);
1343+
C1 = randn(T, M, N);
1344+
end
1345+
C2 = copy(C1); C3 = copy(C1);
1346+
AtmulBpos!(C1, A, B)
1347+
AtmulBposavx!(C2, A, B)
1348+
AtmulBpos_avx!(C3, A, B)
1349+
@test C1 ≈ C2
1350+
@test C1 ≈ C3
12961351
end
12971352
end
12981353

@@ -1387,6 +1442,15 @@ end
13871442
end
13881443
end
13891444
end
1445+
myzero(A) = zero(eltype(A))
1446+
# function AmulBavx4!(C, A, B)
1447+
# @avx for m ∈ 1:size(A,1), n ∈ 1:size(B,2)
1448+
# C[m,n] = myzero(C)
1449+
# for k ∈ 1:size(A,2)
1450+
# C[m,n] += A[m,k] * B[k,n]
1451+
# end
1452+
# end
1453+
# end
13901454
function AmuladdBavx!(C, A, B, factor = 1)
13911455
@avx for m ∈ 1:size(A,1), n ∈ 1:size(B,2)
13921456
ΔCₘₙ = zero(eltype(C))
@@ -1457,6 +1521,21 @@ end
14571521
end
14581522
end
14591523
end
1524+
# function AmulB_avx4!(C, A, B)
1525+
# @_avx for m ∈ 1:size(A,1), n ∈ 1:size(B,2)
1526+
# C[m,n] = myzero(C)
1527+
# for k ∈ 1:size(A,2)
1528+
# C[m,n] += A[m,k] * B[k,n]
1529+
# end
1530+
# end
1531+
# end
1532+
# q = :(for m ∈ 1:size(A,1), n ∈ 1:size(B,2)
1533+
# C[m,n] = myzero(C)
1534+
# for k ∈ 1:size(A,2)
1535+
# C[m,n] += A[m,k] * B[k,n]
1536+
# end
1537+
# end)
1538+
# ls = LoopVectorization.LoopSet(q);
14601539
function AmuladdB_avx!(C, A, B, factor = 1)
14611540
@_avx for m ∈ 1:size(A,1), n ∈ 1:size(B,2)
14621541
ΔCₘₙ = zero(eltype(C))

0 commit comments

Comments
 (0)