
Commit 7f7d281

don't reduce gather/scatter cost

1 parent 3c21e58

File tree

2 files changed: +53 -53 lines


Project.toml

Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@
 name = "LoopVectorization"
 uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
 authors = ["Chris Elrod <[email protected]>"]
-version = "0.12.68"
+version = "0.12.69"
 
 [deps]
 ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"

src/modeling/determinestrategy.jl

Lines changed: 52 additions & 52 deletions

@@ -45,61 +45,61 @@ function unitstride(ls::LoopSet, op::Operation, s::Symbol)
 end
 
 function cost(ls::LoopSet, op::Operation, (u₁,u₂)::Tuple{Symbol,Symbol}, vloopsym::Symbol, Wshift::Int, size_T::Int = op.elementbytes)
-    isconstant(op) && return 0.0, 0, 1.0#Float64(length(loopdependencies(op)) > 0)
-    isloopvalue(op) && return 0.0, 0, 0.0
-    instr = instruction(op)
-    if length(parents(op)) == 1
-        if instr == Instruction(:-) || instr === Instruction(:sub_fast) || instr == Instruction(:+) || instr == Instruction(:add_fast)
-            return 0.0, 0, 0.0
-        end
-    elseif iscompute(op) &&
-           (Base.sym_in(instruction(op).instr, (:vadd_nsw, :vsub_nsw, :(+), :(-), :add_fast, :sub_fast)) &&
-            all(opp -> (isloopvalue(opp)), parents(op)))# || (reg_count(ls) == 32) && (instruction(op).instr === :ifelse))
-        # all(opp -> (isloopvalue(opp) | isconstant(opp)), parents(op))
-        return 0.0, 0, 0.0
-    end
-    opisvectorized = isvectorized(op)
-    srt, sl, srp = opisvectorized ? vector_cost(instr, Wshift, size_T) : scalar_cost(instr)
-    if accesses_memory(op)
-        # either vbroadcast/reductionstore, vmov(a/u)pd, or gather/scatter
-        if opisvectorized
-            if !unitstride(ls, op, vloopsym)# || !isdense(op) # need gather/scatter
-                indices = getindices(op)
-                contigind = first(indices)
-                shifter = max(2,Wshift)
-                if rejectinterleave(op)
-                    offset = 0.0 # gather/scatter, alignment doesn't matter
-                else
-                    shifter -= 1
-                    offset = 0.5reg_size(ls) / cache_lnsze(ls)
-                end
-                if shifter > 1 &&
-                   (!rejectcurly(op) && (((contigind === CONSTANTZEROINDEX) && ((length(indices) > 1) && (indices[2] === u₁) || (indices[2] === u₂))) ||
-                    ((u₁ === contigind) | (u₂ === contigind))))
+    isconstant(op) && return 0.0, 0, 1.0#Float64(length(loopdependencies(op)) > 0)
+    isloopvalue(op) && return 0.0, 0, 0.0
+    instr = instruction(op)
+    if length(parents(op)) == 1
+        if instr == Instruction(:-) || instr === Instruction(:sub_fast) || instr == Instruction(:+) || instr == Instruction(:add_fast)
+            return 0.0, 0, 0.0
+        end
+    elseif iscompute(op) &&
+           (Base.sym_in(instruction(op).instr, (:vadd_nsw, :vsub_nsw, :(+), :(-), :add_fast, :sub_fast)) &&
+            all(opp -> (isloopvalue(opp)), parents(op)))# || (reg_count(ls) == 32) && (instruction(op).instr === :ifelse))
+        # all(opp -> (isloopvalue(opp) | isconstant(opp)), parents(op))
+        return 0.0, 0, 0.0
+    end
+    opisvectorized = isvectorized(op)
+    srt, sl, srp = opisvectorized ? vector_cost(instr, Wshift, size_T) : scalar_cost(instr)
+    if accesses_memory(op)
+        # either vbroadcast/reductionstore, vmov(a/u)pd, or gather/scatter
+        if opisvectorized
+            if !unitstride(ls, op, vloopsym)# || !isdense(op) # need gather/scatter
+                indices = getindices(op)
+                contigind = first(indices)
+                shifter = max(2,Wshift)
+                if rejectinterleave(op)
+                    offset = 0.0 # gather/scatter, alignment doesn't matter
+                else
+                    shifter -= 1
+                    offset = 0.5reg_size(ls) / cache_lnsze(ls)
+                    if shifter > 1 &&
+                       (!rejectcurly(op) && (((contigind === CONSTANTZEROINDEX) && ((length(indices) > 1) && (indices[2] === u₁) || (indices[2] === u₂))) ||
+                        ((u₁ === contigind) | (u₂ === contigind))))
 
-                    shifter -= 1
-                    offset = 0.5reg_size(ls) / cache_lnsze(ls)
-                end
-                r = 1 << shifter
-                srt = srt*r + offset
-                sl *= r
-            elseif isload(op) & (length(loopdependencies(op)) > 1)# vmov(a/u)pd
-                # penalize vectorized loads with more than 1 loopdep
-                # heuristic; more than 1 loopdep means that many loads will not be aligned
-                # Roughly corresponds to double-counting loads crossing cacheline boundaries
-                # TODO: apparently the new ARM A64FX CPU (with 512 bit vectors) is NOT penalized for unaligned loads
-                # would be nice to add a check for this CPU, to see if such a penalty is still appropriate.
-                # Also, once more SVE (scalable vector extension) CPUs are released, would be nice to know if
-                # this feature is common to all of them.
-                srt += 0.5reg_size(ls) / cache_lnsze(ls)
-                # srt += 0.25reg_size(ls) / cache_lnsze(ls)
-            end
-        elseif isstore(op) # broadcast or reductionstore; if store we want to penalize reduction
-            srt *= 3
-            sl *= 3
+                        shifter -= 1
+                        offset = 0.5reg_size(ls) / cache_lnsze(ls)
+                    end
                 end
+                r = 1 << shifter
+                srt = srt*r + offset
+                sl *= r
+            elseif isload(op) & (length(loopdependencies(op)) > 1)# vmov(a/u)pd
+                # penalize vectorized loads with more than 1 loopdep
+                # heuristic; more than 1 loopdep means that many loads will not be aligned
+                # Roughly corresponds to double-counting loads crossing cacheline boundaries
+                # TODO: apparently the new ARM A64FX CPU (with 512 bit vectors) is NOT penalized for unaligned loads
+                # would be nice to add a check for this CPU, to see if such a penalty is still appropriate.
+                # Also, once more SVE (scalable vector extension) CPUs are released, would be nice to know if
+                # this feature is common to all of them.
+                srt += 0.5reg_size(ls) / cache_lnsze(ls)
+                # srt += 0.25reg_size(ls) / cache_lnsze(ls)
+            end
+        elseif isstore(op) # broadcast or reductionstore; if store we want to penalize reduction
+            srt *= 3
+            sl *= 3
         end
-    srt, sl, Float64(srp+1)
+    end
+    srt, sl, Float64(srp+1)
 end
 
 # Base._return_type()
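
The only semantic change in this hunk is that the "if shifter > 1" reduction now sits inside the else branch, so it no longer fires when rejectinterleave(op) is true, i.e. for genuine gather/scatter accesses. A minimal standalone sketch of the before/after behavior follows; it is not the package's API: gather stands in for rejectinterleave(op), unroll_on_contig for the rejectcurly/contigind condition, and penalty for 0.5reg_size(ls) / cache_lnsze(ls).

# Sketch only; mirrors the moved block, not LoopVectorization internals.
function cost_multiplier_old(Wshift::Int, gather::Bool, unroll_on_contig::Bool; penalty = 0.5)
    shifter = max(2, Wshift)
    if gather
        offset = 0.0                    # gather/scatter, alignment doesn't matter
    else
        shifter -= 1
        offset = penalty
    end
    if shifter > 1 && unroll_on_contig  # old: this reduction ran even for gather/scatter
        shifter -= 1
        offset = penalty
    end
    (1 << shifter, offset)              # (r, offset); the caller computes srt*r + offset
end

function cost_multiplier_new(Wshift::Int, gather::Bool, unroll_on_contig::Bool; penalty = 0.5)
    shifter = max(2, Wshift)
    offset = 0.0
    if !gather                          # new: the reduction only applies on the contiguous path
        shifter -= 1
        offset = penalty
        if shifter > 1 && unroll_on_contig
            shifter -= 1
            offset = penalty
        end
    end
    (1 << shifter, offset)
end

With Wshift = 3 (an 8-lane vector) and a gather/scatter access whose contiguous index is unrolled, cost_multiplier_old(3, true, true) returns (4, 0.5) while cost_multiplier_new(3, true, true) returns (8, 0.0): the throughput cost srt is now scaled by the full 1 << max(2, Wshift) rather than being halved, which is what the commit title means by not reducing the gather/scatter cost. Contiguous (non-gather) accesses get identical results from both versions.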

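The alignment penalty used in both branches is plain arithmetic over hardware sizes. An illustrative evaluation, assuming a typical 64-byte cache line and common register widths rather than querying reg_size(ls) and cache_lnsze(ls):

# Illustrative numbers for 0.5reg_size(ls) / cache_lnsze(ls); sizes assumed, not queried.
cacheline_bytes = 64                 # common x86 cache line size
0.5 * 64 / cacheline_bytes           # 0.5  with 64-byte (AVX-512) registers
0.5 * 32 / cacheline_bytes           # 0.25 with 32-byte (AVX2) registers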