Skip to content

Commit b4f79a6

Browse files
committed
1 parent 1e35018 commit b4f79a6

File tree

5 files changed

+37
-29
lines changed

5 files changed

+37
-29
lines changed

src/LoopVectorization.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ using Requires
5151

5252

5353
export LowDimArray, stridedpointer, indices,
54-
@avx, @avxt, @_avx, *ˡ, _avx_!,
54+
@avx, @avxt, @spmd, @spmdt, *ˡ, _avx_!,
5555
vmap, vmap!, vmapt, vmapt!, vmapnt, vmapnt!, vmapntt, vmapntt!,
5656
tanh_fast, sigmoid_fast,
5757
vfilter, vfilter!, vmapreduce, vreduce

src/constructors.jl

Lines changed: 30 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ function process_args(args; inline = false, check_empty = false, u₁ = zero(Int
111111
end
112112
inline, check_empty, u₁, u₂, threads
113113
end
114-
function avx_macro(mod, src, q, args...)
114+
function spmd_macro(mod, src, q, args...)
115115
q = macroexpand(mod, q)
116116

117117
if q.head === :for
@@ -124,12 +124,12 @@ function avx_macro(mod, src, q, args...)
124124
end
125125
end
126126
"""
127-
@avx
127+
@spmd
128128
129129
Annotate a `for` loop, or a set of nested `for` loops whose bounds are constant across iterations, to optimize the computation. For example:
130130
131-
function AmulBavx!(C, A, B)
132-
@avx for m ∈ 1:size(A,1), n ∈ 1:size(B,2)
131+
function AmulB!(C, A, B)
132+
@spmd for m ∈ 1:size(A,1), n ∈ 1:size(B,2)
133133
Cₘₙ = zero(eltype(C))
134134
for k ∈ 1:size(A,2)
135135
Cₘₙ += A[m,k] * B[k,n]
@@ -148,23 +148,23 @@ julia> using LoopVectorization
148148
149149
julia> a = rand(100);
150150
151-
julia> b = @avx exp.(2 .* a);
151+
julia> b = @spmd exp.(2 .* a);
152152
153153
julia> c = similar(b);
154154
155-
julia> @avx @. c = exp(2a);
155+
julia> @spmd @. c = exp(2a);
156156
157157
julia> b ≈ c
158158
true
159159
```
160160
161161
# Extended help
162162
163-
Advanced users can customize the implementation of the `@avx`-annotated block
163+
Advanced users can customize the implementation of the `@spmd`-annotated block
164164
using keyword arguments:
165165
166166
```
167-
@avx inline=false unroll=2 body
167+
@spmd inline=false unroll=2 body
168168
```
169169
170170
where `body` is the code of the block (e.g., `for ... end`).
@@ -197,42 +197,42 @@ but it applies to the loop ordering and unrolling that will be chosen by LoopVec
197197
`uᵢ=0` (the default) indicates that LoopVectorization should pick its own value,
198198
and `uᵢ=-1` disables unrolling for the corresponding loop.
199199
200-
The `@avx` macro also checks the array arguments using `LoopVectorization.check_args` to try and determine
200+
The `@spmd` macro also checks the array arguments using `LoopVectorization.check_args` to try and determine
201201
if they are compatible with the macro. If `check_args` returns false, a fallback loop annotated with `@inbounds`
202202
and `@fastmath` is generated. Note that `VectorizationBase` provides functions such as `vadd` and `vmul` that will
203-
ignore `@fastmath`, preserving IEEE semantics both within `@avx` and `@fastmath`.
203+
ignore `@fastmath`, preserving IEEE semantics both within `@spmd` and `@fastmath`.
204204
`check_args` currently returns false for some wrapper types like `LinearAlgebra.UpperTriangular`, requiring you to
205205
use their `parent`. Triangular loops aren't yet supported.
206206
"""
207-
macro avx(args...)
208-
avx_macro(__module__, __source__, last(args), Base.front(args)...)
207+
macro spmd(args...)
208+
spmd_macro(__module__, __source__, last(args), Base.front(args)...)
209209
end
210210
"""
211-
Equivalent to `@avx`, except it adds `thread=true` as the first keyword argument.
211+
Equivalent to `@spmd`, except it adds `thread=true` as the first keyword argument.
212212
Note that later arguments take precedence.
213213
214-
Meant for convenience, as `@avxt` is shorter than `@avx thread=true`.
214+
Meant for convenience, as `@spmdt` is shorter than `@spmd thread=true`.
215215
"""
216-
macro avxt(args...)
217-
avx_macro(__module__, __source__, last(args), :(thread=true), Base.front(args)...)
216+
macro spmdt(args...)
217+
spmd_macro(__module__, __source__, last(args), :(thread=true), Base.front(args)...)
218218
end
219219

220220
"""
221-
@_avx
221+
@_spmd
222222
223-
This macro transforms loops similarly to [`@avx`](@ref).
224-
While `@avx` punts to a generated function to enable type-based analysis, `@_avx`
225-
works on just the expressions. This requires that it makes a number of default assumptions. Use of `@avx` is preferred.
223+
This macro mostly exists for debugging/testing purposes. It does not support many of the use cases of [`@spmd`](@ref).
224+
It emits loops directly, rather than punting to an `@generated` function, meaning it doesn't have access to type
225+
information when generating code or analyzing the loops, often leading to bad performance.
226226
227-
This macro accepts the `inline` and `unroll` keyword arguments like `@avx`, but ignores the `check_empty` argument.
227+
This macro accepts the `inline` and `unroll` keyword arguments like `@spmd`, but ignores the `check_empty` argument.
228228
"""
229-
macro _avx(q)
229+
macro _spmd(q)
230230
q = macroexpand(__module__, q)
231231
ls = LoopSet(q, __module__)
232232
set_hw!(ls)
233233
esc(Expr(:block, ls.prepreamble, lower_and_split_loops(ls, -1)))
234234
end
235-
macro _avx(arg, q)
235+
macro _spmd(arg, q)
236236
@assert q.head === :for
237237
q = macroexpand(__module__, q)
238238
inline, check_empty, u₁, u₂ = check_macro_kwarg(arg, false, false, zero(Int8), zero(Int8), 1)
@@ -241,8 +241,14 @@ macro _avx(arg, q)
241241
esc(Expr(:block, ls.prepreamble, lower(ls, u₁ % Int, u₂ % Int, -1)))
242242
end
243243

244-
macro avx_debug(q)
244+
macro spmd_debug(q)
245245
q = macroexpand(__module__, q)
246246
ls = LoopSet(q, __module__)
247247
esc(LoopVectorization.setup_call_debug(ls))
248248
end
249+
250+
# define aliases
251+
const var"@avx" = var"@spmd"
252+
const var"@avxt" = var"@spmdt"
253+
const var"@avx_debug" = var"@spmd_debug"
254+

src/modeling/determinestrategy.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -280,7 +280,7 @@ function unroll_no_reductions(ls, order, vloopsym)
280280
u = demote_unroll_factor(ls, u, vloopsym)
281281
end
282282
remaining_reg = max(8, (reg_count(ls) - round(Int,rpc))) # spilling a few consts isn't so bad
283-
reg_constraint = max(1, remaining_reg ÷ round(Int,rpp))
283+
reg_constraint = max(1, remaining_reg ÷ max(1,round(Int,rpp)))
284284
clamp(u, 1, reg_constraint), unrolled
285285
# rt = max(compute_rt, load_rt + store_rt)
286286
# # (iszero(rt) ? 4 : max(1, roundpow2( min( 4, round(Int, 16 / rt) ) ))), unrolled

src/precompile.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ function _precompile_()
22
ccall(:jl_generating_output, Cint, ()) == 1 || return nothing
33
# Base.precompile(Tuple{typeof(which(_avx_!,(Val{UNROLL},Val{OPS},Val{ARF},Val{AM},Val{LPSYM},Tuple{LB, V},)).generator.gen),Any,Any,Any,Any,Any,Any,Any,Any,Type,Type,Type,Type,Any,Any}) # time: 1.0198073
44
# Base.precompile(Tuple{typeof(gespf1),Any,Tuple{Any, VectorizationBase.NullStep}}) # time: 0.1096832
5-
Base.precompile(Tuple{typeof(avx_macro),Module,LineNumberNode,Expr}) # time: 0.09183489
5+
Base.precompile(Tuple{typeof(spmd_macro),Module,LineNumberNode,Expr}) # time: 0.09183489
66
Base.precompile(Tuple{typeof(gespf1),StridedPointer{Float64, 1, 1, 0, (1,), Tuple{StaticInt{8}}, Tuple{StaticInt{1}}},Tuple{StaticInt{1}}}) # time: 0.05469272
77
Base.precompile(Tuple{typeof(zerorangestart),UnitRange{Int}}) # time: 0.04291692
88
Base.precompile(Tuple{Type{LoopSet},Symbol}) # time: 0.03362425
@@ -32,9 +32,9 @@ function _precompile_()
3232
Base.precompile(Tuple{typeof(array_reference_meta!),LoopSet,Symbol,SubArray{Any, 1, Vector{Any}, Tuple{UnitRange{Int}}, true},Int,Nothing}) # time: 0.008658403
3333
Base.precompile(Tuple{typeof(gespf1),StridedPointer{Int, 3, 1, 0, (1, 2, 3), Tuple{StaticInt{8}, Int, Int}, Tuple{StaticInt{1}, StaticInt{1}, StaticInt{1}}},Tuple{StaticInt{1}}}) # time: 0.008647886
3434
Base.precompile(Tuple{typeof(uniquearrayrefs_csesummary),LoopSet}) # time: 0.008355928
35-
Base.precompile(Tuple{typeof(avx_macro),Module,LineNumberNode,Expr,Expr,Vararg{Expr}}) # time: 0.007974428
35+
Base.precompile(Tuple{typeof(spmd_macro),Module,LineNumberNode,Expr,Expr,Vararg{Expr}}) # time: 0.007974428
3636
Base.precompile(Tuple{typeof(tryrefconvert),LoopSet,Expr,Int,Nothing}) # time: 0.007913027
37-
Base.precompile(Tuple{typeof(avx_macro),Module,LineNumberNode,Expr,Expr}) # time: 0.007347188
37+
Base.precompile(Tuple{typeof(spmd_macro),Module,LineNumberNode,Expr,Expr}) # time: 0.007347188
3838
Base.precompile(Tuple{typeof(show),IOContext{IOBuffer},Operation}) # time: 0.007273663
3939
Base.precompile(Tuple{typeof(loop_boundaries),LoopSet,Vector{Bool}}) # time: 0.006810827
4040
Base.precompile(Tuple{typeof(gespf1),StridedPointer{Float32, 4, 1, 0, (1, 2, 3, 4), Tuple{StaticInt{4}, Int, Int, Int}, NTuple{4, StaticInt{1}}},Tuple{StaticInt{-1}, StaticInt{-1}, StaticInt{1}, StaticInt{1}}}) # time: 0.006164707

test/testsetup.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
using Test
22
using LoopVectorization
33

4+
const var"@_avx" = LoopVectorization.var"@_spmd"
5+
46
using LinearAlgebra
57
function clenshaw(x, coeff)
68
len_c = length(coeff)

0 commit comments

Comments
 (0)