@@ -45,61 +45,61 @@ function unitstride(ls::LoopSet, op::Operation, s::Symbol)
 end
 
 function cost(ls::LoopSet, op::Operation, (u₁,u₂)::Tuple{Symbol,Symbol}, vloopsym::Symbol, Wshift::Int, size_T::Int = op.elementbytes)
-    isconstant(op) && return 0.0, 0, 1.0 # Float64(length(loopdependencies(op)) > 0)
-    isloopvalue(op) && return 0.0, 0, 0.0
-    instr = instruction(op)
-    if length(parents(op)) == 1
-        if instr == Instruction(:-) || instr === Instruction(:sub_fast) || instr == Instruction(:+) || instr == Instruction(:add_fast)
-            return 0.0, 0, 0.0
-        end
-    elseif iscompute(op) &&
-        (Base.sym_in(instruction(op).instr, (:vadd_nsw, :vsub_nsw, :(+), :(-), :add_fast, :sub_fast)) &&
-        all(opp -> (isloopvalue(opp)), parents(op)))# || (reg_count(ls) == 32) && (instruction(op).instr === :ifelse))
-        # all(opp -> (isloopvalue(opp) | isconstant(opp)), parents(op))
-        return 0.0, 0, 0.0
-    end
-    opisvectorized = isvectorized(op)
-    srt, sl, srp = opisvectorized ? vector_cost(instr, Wshift, size_T) : scalar_cost(instr)
-    if accesses_memory(op)
-        # either vbroadcast/reductionstore, vmov(a/u)pd, or gather/scatter
-        if opisvectorized
-            if !unitstride(ls, op, vloopsym)# || !isdense(op) # need gather/scatter
-                indices = getindices(op)
-                contigind = first(indices)
-                shifter = max(2,Wshift)
-                if rejectinterleave(op)
-                    offset = 0.0 # gather/scatter, alignment doesn't matter
-                else
-                    shifter -= 1
-                    offset = 0.5reg_size(ls) / cache_lnsze(ls)
-                end
-                if shifter > 1 &&
-                    (!rejectcurly(op) && (((contigind === CONSTANTZEROINDEX) && ((length(indices) > 1) && (indices[2] === u₁) || (indices[2] === u₂))) ||
-                    ((u₁ === contigind) | (u₂ === contigind))))
+    isconstant(op) && return 0.0, 0, 1.0 # Float64(length(loopdependencies(op)) > 0)
+    isloopvalue(op) && return 0.0, 0, 0.0
+    instr = instruction(op)
+    if length(parents(op)) == 1
+        if instr == Instruction(:-) || instr === Instruction(:sub_fast) || instr == Instruction(:+) || instr == Instruction(:add_fast)
+            return 0.0, 0, 0.0
+        end
+    elseif iscompute(op) &&
+        (Base.sym_in(instruction(op).instr, (:vadd_nsw, :vsub_nsw, :(+), :(-), :add_fast, :sub_fast)) &&
+        all(opp -> (isloopvalue(opp)), parents(op)))# || (reg_count(ls) == 32) && (instruction(op).instr === :ifelse))
+        # all(opp -> (isloopvalue(opp) | isconstant(opp)), parents(op))
+        return 0.0, 0, 0.0
+    end
+    opisvectorized = isvectorized(op)
+    srt, sl, srp = opisvectorized ? vector_cost(instr, Wshift, size_T) : scalar_cost(instr)
+    if accesses_memory(op)
+        # either vbroadcast/reductionstore, vmov(a/u)pd, or gather/scatter
+        if opisvectorized
+            if !unitstride(ls, op, vloopsym)# || !isdense(op) # need gather/scatter
+                indices = getindices(op)
+                contigind = first(indices)
+                shifter = max(2,Wshift)
+                if rejectinterleave(op)
+                    offset = 0.0 # gather/scatter, alignment doesn't matter
+                else
+                    shifter -= 1
+                    offset = 0.5reg_size(ls) / cache_lnsze(ls)
+                    if shifter > 1 &&
+                        (!rejectcurly(op) && (((contigind === CONSTANTZEROINDEX) && ((length(indices) > 1) && (indices[2] === u₁) || (indices[2] === u₂))) ||
+                        ((u₁ === contigind) | (u₂ === contigind))))
 
-                    shifter -= 1
-                    offset = 0.5reg_size(ls) / cache_lnsze(ls)
-                end
-                r = 1 << shifter
-                srt = srt*r + offset
-                sl *= r
-            elseif isload(op) & (length(loopdependencies(op)) > 1)# vmov(a/u)pd
-                # penalize vectorized loads with more than 1 loopdep
-                # heuristic; more than 1 loopdep means that many loads will not be aligned
-                # Roughly corresponds to double-counting loads crossing cacheline boundaries
-                # TODO: apparently the new ARM A64FX CPU (with 512 bit vectors) is NOT penalized for unaligned loads
-                # would be nice to add a check for this CPU, to see if such a penalty is still appropriate.
-                # Also, once more SVE (scalable vector extension) CPUs are released, would be nice to know if
-                # this feature is common to all of them.
-                srt += 0.5reg_size(ls) / cache_lnsze(ls)
-                # srt += 0.25reg_size(ls) / cache_lnsze(ls)
-            end
-        elseif isstore(op) # broadcast or reductionstore; if store we want to penalize reduction
-            srt *= 3
-            sl *= 3
+                        shifter -= 1
+                        offset = 0.5reg_size(ls) / cache_lnsze(ls)
+                    end
                 end
+                r = 1 << shifter
+                srt = srt*r + offset
+                sl *= r
+            elseif isload(op) & (length(loopdependencies(op)) > 1)# vmov(a/u)pd
+                # penalize vectorized loads with more than 1 loopdep
+                # heuristic; more than 1 loopdep means that many loads will not be aligned
+                # Roughly corresponds to double-counting loads crossing cacheline boundaries
+                # TODO: apparently the new ARM A64FX CPU (with 512 bit vectors) is NOT penalized for unaligned loads
+                # would be nice to add a check for this CPU, to see if such a penalty is still appropriate.
+                # Also, once more SVE (scalable vector extension) CPUs are released, would be nice to know if
+                # this feature is common to all of them.
+                srt += 0.5reg_size(ls) / cache_lnsze(ls)
+                # srt += 0.25reg_size(ls) / cache_lnsze(ls)
+            end
+        elseif isstore(op) # broadcast or reductionstore; if store we want to penalize reduction
+            srt *= 3
+            sl *= 3
         end
-    srt, sl, Float64(srp+1)
+    end
+    srt, sl, Float64(srp+1)
 end
 
 # Base._return_type()
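
The hunk above nests the second `shifter` reduction inside the `else` branch of `rejectinterleave(op)`, so the extra discount only applies when the access can be interleaved into contiguous vector loads; a true gather/scatter keeps the full `max(2, Wshift)` shift. The sketch below is a minimal, self-contained restatement of that penalty arithmetic, not LoopVectorization's API: `discontiguous_cost`, `base_rt`, `interleaveable`, `unrolled_on_contig`, `reg_size_bytes`, and `cacheline_bytes` are illustrative stand-ins for values the package derives from the LoopSet and the host CPU.

# Hypothetical sketch of the non-unit-stride cost adjustment in the hunk above.
# All names and default values are assumptions for illustration only.
function discontiguous_cost(base_rt::Float64, Wshift::Int;
                            interleaveable::Bool = false,     # stands in for !rejectinterleave(op)
                            unrolled_on_contig::Bool = false, # stands in for the !rejectcurly(op)/contiguous-index-unrolled check
                            reg_size_bytes::Int = 64,
                            cacheline_bytes::Int = 64)
    shifter = max(2, Wshift)   # start from the full gather/scatter penalty
    offset = 0.0               # alignment is irrelevant for a true gather/scatter
    if interleaveable
        shifter -= 1           # interleaving halves the modeled cost...
        offset = 0.5 * reg_size_bytes / cacheline_bytes  # ...but pays for cacheline crossings
        if shifter > 1 && unrolled_on_contig
            shifter -= 1       # unrolling along the contiguous axis halves it again
            offset = 0.5 * reg_size_bytes / cacheline_bytes
        end
    end
    r = 1 << shifter           # each remaining shift doubles the reciprocal-throughput estimate
    base_rt * r + offset
end

Under these assumed parameters (Wshift = 3, i.e. 8-wide vectors, with 64-byte registers and 64-byte cache lines), the sketch gives roughly 8*base_rt for a gather/scatter, 4*base_rt + 0.5 for an interleaveable access, and 2*base_rt + 0.5 when the contiguous index is also unrolled, mirroring the `srt = srt*r + offset` update in the diff.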