Skip to content

Commit fb90b62

Browse files
committed
Add additional check that a reduction is in fact a reduction before determining whether it can be eliminated.
1 parent fedddf7 commit fb90b62

File tree

4 files changed

+29
-4
lines changed

4 files changed

+29
-4
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "LoopVectorization"
22
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
33
authors = ["Chris Elrod <[email protected]>"]
4-
version = "0.12.71"
4+
version = "0.12.72"
55

66
[deps]
77
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"

src/codegen/operation_evaluation_order.jl

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,15 +114,26 @@ function load_short_static_reduction_first!(ls::LoopSet, u₁loop::Symbol, u₂l
114114
for op ∈ operations(ls)
115115
iscompute(op) || continue
116116
length(reduceddependencies(op)) == 0 && continue
117+
parents_op = parents(op)
118+
length(parents_op) == 2 || continue
119+
found = false
120+
parent₁deps = loopdependencies(parents_op[1])
121+
parent₂deps = loopdependencies(parents_op[2])
122+
for reduced_dep ∈ reduceddependencies(op)
123+
if (reduced_dep ∈ parent₁deps) || (reduced_dep ∈ parent₂deps)
124+
found = true
125+
break
126+
end
127+
end
128+
found || continue
117129
if (instruction(op).instr === :reduced_add)
118130
vecloop = getloop(ls, vectorized)
119131
if isstaticloop(vecloop) && (length(vecloop) ≤ 16) && nounrollreduction(op, u₁loop, u₂loop, vectorized)
120132
opsub = parents(op)[2]
121133
length(children(opsub)) == 1 || continue
122134
opsearch = parents(op)[1]
123135
opcheck = search_for_reductinit!(opsearch, opsub, name(opsearch), loopdependencies(op))
124-
opcheck === opsearch || replace_reduct_init!(ls, op, opsub, opcheck)
125-
136+
opcheck === opsearch || replace_reduct_init!(ls, op, opsub, opcheck)
126137
end
127138
elseif (instruction(op).instr === :add_fast) && (instruction(first(parents(op))).instr === :identity)
128139
vecloop = getloop(ls, vectorized)

src/reconstruct_loopset.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -726,7 +726,7 @@ Execute an `@turbo` block. The block's code is represented via the arguments:
726726
)
727727
else
728728
# Main.BODY[] = avx_body(ls, var"#UNROLL#")
729-
# @show avx_body(ls, var"#UNROLL#")
729+
# return @show avx_body(ls, var"#UNROLL#")
730730
avx_body(ls, var"#UNROLL#")
731731
end
732732
# @show var"#UNROLL#", var"#OPS#", var"#ARF#", var"#AM#", var"#LPSYM#", var"#LB#"

test/gemm.jl

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -558,6 +558,17 @@
558558
end
559559
return C
560560
end
561+
function dense!(f::F, C, A, B) where {F}
562+
K = ArrayInterface.size(A, StaticInt(2))
563+
Kp1 = K + StaticInt(1)
564+
@turbo for n ∈ indices((B,C),2), m ∈ indices((A,C),1)
565+
Cmn = zero(eltype(C))
566+
for k ∈ 1:K
567+
Cmn += A[m,k] * B[k,n]
568+
end
569+
C[m,n] = f(Cmn + A[m,Kp1])
570+
end
571+
end
561572

562573
# TODO: add fast=false option to `@turbo`
563574
# function gemm_accurate!(C, A, B)
@@ -712,6 +723,9 @@
712723
At = copy(A');
713724
Bt = copy(B');
714725
C2 = similar(C);
726+
A2 = rand(R, M, K+1)
727+
dense!(VectorizationBase.relu, C, A2, B);
728+
@test C ≈ VectorizationBase.relu.(@view(A2[:,begin:end-1]) * B .+ @view(A2[:,end]))
715729
@testset "avx $T dynamc gemm" begin
716730
AmulB!(C2, A, B)
717731
AmulBavx1!(C, A, B)

0 commit comments

Comments
 (0)