1
- # Expression-generator for vmap!
2
- function vmap_quote (N, :: Type{T} ) where {T}
3
- W, Wshift = VectorizationBase. pick_vector_width_shift (T)
4
- val = Expr (:call , Expr (:curly , :Val , W))
5
- q = Expr (:block , Expr (:(= ), :M , Expr (:call , :length , :dest )), Expr (:(= ), :vdest , Expr (:call , :pointer , :dest )), Expr (:(= ), :m , 0 ))
6
- fcall = Expr (:call , :f )
7
- loopbody = Expr (:block , Expr (:call , :vstore! , Expr (:call , :gep , :vdest , :m ), fcall), Expr (:(+= ), :m , W))
8
- fcallmask = Expr (:call , :f )
9
- bodymask = Expr (:block , Expr (:(= ), :__mask__ , Expr (:call , :mask , val, Expr (:call , :& , :M , W- 1 ))), Expr (:call , :vstore! , Expr (:call , :gep , :vdest , :m ), fcallmask, :__mask__ ))
10
- for n ∈ 1 : N
11
- arg_n = Symbol (:varg_ ,n)
12
- push! (q. args, Expr (:(= ), arg_n, Expr (:macrocall , Symbol (" @inbounds" ), LineNumberNode (@__LINE__ ,Symbol (@__FILE__ )), Expr (:call , :pointer , Expr (:ref , :args , n)))))
13
- push! (fcall. args, Expr (:call , :vload , val, Expr (:call , :gep , arg_n, :m )))
14
- push! (fcallmask. args, Expr (:call , :vload , val, Expr (:call , :gep , arg_n, :m ), :__mask__ ))
15
- end
16
- loop = Expr (:for , Expr (:(= ), :_ , Expr (:call , :(:), 0 , Expr (:call , :- , Expr (:call , :(>>> ), :M , Wshift), 1 ))), loopbody)
17
- push! (q. args, loop)
18
- ifmask = Expr (:if , Expr (:call , :(!= ), :m , :M ), bodymask)
19
- push! (q. args, ifmask)
20
- push! (q. args, :dest )
21
- q
22
- end
23
- """
24
- vmap!(f, destination, a::AbstractArray)
25
- vmap!(f, destination, a::AbstractArray, b::AbstractArray, ...)
26
1
27
- Vectorized-`map!`, applying `f` to each element of `a` (or paired elements of `a`, `b`, ...)
28
- and storing the result in `destination`.
29
2
"""
30
- @generated function vmap! (f:: F , dest:: AbstractArray{T} , args:: Vararg{<:AbstractArray,N} ) where {F,T,N}
31
- # do not change argnames here without compensatory changes in vmap_quote
32
- vmap_quote (N, T)
33
- end
34
-
3
+ `vstorent!` (non-temporal store) requires data to be aligned.
4
+ `alignstores!` will align `y` in preparation for the non-temporal maps.
5
+ """
35
6
function alignstores! (f:: F , y:: AbstractVector{T} , args:: Vararg{<:Any,A} ) where {F,T,A}
36
7
N = length (y)
37
8
ptry = pointer (y)
@@ -46,13 +17,129 @@ function alignstores!(f::F, y::AbstractVector{T}, args::Vararg{<:Any,A}) where {
46
17
if N < i
47
18
m &= mask (T, N & (W - 1 ))
48
19
end
49
- vstore ! (ptry, extract_data (f (vload .(V, ptrargs, m)... )), m)
20
+ vnoaliasstore ! (ptry, extract_data (f (vload .(V, ptrargs, m)... )), m)
50
21
gep (ptry, i), gep .(ptrargs, i), N - i
51
22
else
52
23
ptry, ptrargs, N
53
24
end
54
25
end
55
26
27
"""
    vmap_singlethread!(f, y, ::Val{NonTemporal}, args...)

Serial kernel shared by `vmap!` (`NonTemporal == false`) and `vmapnt!`
(`NonTemporal == true`).

Walks `y` and every array in `args` through raw pointers, applying `f` to vectors
obtained with `vload` and writing the results back to `y`:

1. a main loop that handles four vectors (`4W` elements) per iteration,
2. a clean-up loop that handles one full vector (`W` elements) per iteration,
3. a final masked load/store for the remaining `N & (W - 1)` elements.

When `NonTemporal` is `true`, `alignstores!` is called first and the loops continue from
the pointers and remaining element count it returns; full vectors are then written with
`vstorent!`. Otherwise full vectors are written with `vnoaliasstore!`. The masked
remainder always uses `vnoaliasstore!`.

No length check is performed here: every array in `args` is read at the same offsets that
are written in `y`.

Returns `y`.
"""
function vmap_singlethread!(f::F, y::AbstractVector{T}, ::Val{NonTemporal}, args::Vararg{<:Any,A}) where {F,T,A,NonTemporal}
    W = VectorizationBase.pick_vector_width(T)
    V = VectorizationBase.pick_vector_width_val(T)
    # Everything below works on raw pointers, which do not root the arrays they came
    # from. Keep `y` and `args` alive for as long as those pointers are in use.
    GC.@preserve y args begin
        if NonTemporal
            ptry, ptrargs, N = alignstores!(f, y, args...)
        else
            N = length(y)
            ptry = pointer(y)
            ptrargs = pointer.(args)
        end
        i = 0
        # Main loop: runs while four full vectors (4W elements) still fit.
        while i < N - ((W << 2) - 1)
            v₁ = extract_data(f(vload.(V, gep.(ptrargs, i))...))
            v₂ = extract_data(f(vload.(V, gep.(ptrargs, vadd(i, W)))...))
            v₃ = extract_data(f(vload.(V, gep.(ptrargs, vadd(i, 2W)))...))
            v₄ = extract_data(f(vload.(V, gep.(ptrargs, vadd(i, 3W)))...))
            if NonTemporal
                vstorent!(gep(ptry, i), v₁)
                vstorent!(gep(ptry, vadd(i, W)), v₂)
                vstorent!(gep(ptry, vadd(i, 2W)), v₃)
                vstorent!(gep(ptry, vadd(i, 3W)), v₄)
            else
                vnoaliasstore!(gep(ptry, i), v₁)
                vnoaliasstore!(gep(ptry, vadd(i, W)), v₂)
                vnoaliasstore!(gep(ptry, vadd(i, 2W)), v₃)
                vnoaliasstore!(gep(ptry, vadd(i, 3W)), v₄)
            end
            i = vadd(i, 4W)
        end
        # Clean-up loop: runs while one full vector (W elements) still fits.
        while i < N - (W - 1)
            vᵢ = extract_data(f(vload.(V, gep.(ptrargs, i))...))
            if NonTemporal
                vstorent!(gep(ptry, i), vᵢ)
            else
                vnoaliasstore!(gep(ptry, i), vᵢ)
            end
            i = vadd(i, W)
        end
        # Fewer than W elements remain: process them with a mask built from the
        # remainder count `N & (W - 1)`.
        if i < N
            m = mask(T, N & (W - 1))
            vnoaliasstore!(gep(ptry, i), extract_data(f(vload.(V, gep.(ptrargs, i), m)...)), m)
        end
    end
    return y
end
71
+
72
"""
    vmap_multithreaded!(f, y, ::Val{NonTemporal}, args...)

Threaded kernel shared by `vmapt!` (`NonTemporal == false`) and `vmapntt!`
(`NonTemporal == true`).

Obtains raw pointers to `y` and to every array in `args` (via `alignstores!` when
`NonTemporal` is `true`, via `pointer` otherwise) and, if any elements remain, hands them
to `_vmap_multithreaded_kernel!`, which does the actual work.

Returns `y`.
"""
function vmap_multithreaded!(f::F, y::AbstractVector{T}, ::Val{NonTemporal}, args::Vararg{<:Any,A}) where {F,T,A,NonTemporal}
    # Raw pointers do not root the arrays they came from. Keep `y` and `args` alive until
    # every thread has finished with them (`Threads.@threads` blocks until completion).
    GC.@preserve y args begin
        # Single assignment on purpose; the loop itself lives behind a function barrier
        # so that the `@threads` closure captures plain, concretely typed arguments
        # instead of boxed locals assigned in two branches.
        ptry, ptrargs, N = if NonTemporal
            alignstores!(f, y, args...)
        else
            (pointer(y), pointer.(args), length(y))
        end
        # `N` is the remaining element count. The kernel computes `N >>> Wsh`, which
        # would be huge for a negative `N`, so only enter it for positive counts.
        if N > 0
            _vmap_multithreaded_kernel!(f, ptry, ptrargs, N, Val{NonTemporal}(), T)
        end
    end
    return y
end

# Threaded body of `vmap_multithreaded!`: a `Threads.@threads` loop over blocks of 4W
# elements, followed by a serial clean-up of the remaining full vectors and a masked tail.
function _vmap_multithreaded_kernel!(f::F, ptry, ptrargs, N, ::Val{NonTemporal}, ::Type{T}) where {F,NonTemporal,T}
    W, Wshift = VectorizationBase.pick_vector_width_shift(T)
    V = VectorizationBase.pick_vector_width_val(T)
    Wsh = Wshift + 2        # shift by log2(4W): each threaded iteration covers 4 vectors
    Niter = N >>> Wsh       # number of complete 4W-element blocks
    Base.Threads.@threads for j ∈ 0:Niter-1
        i = j << Wsh
        v₁ = extract_data(f(vload.(V, gep.(ptrargs, i))...))
        v₂ = extract_data(f(vload.(V, gep.(ptrargs, vadd(i, W)))...))
        v₃ = extract_data(f(vload.(V, gep.(ptrargs, vadd(i, 2W)))...))
        v₄ = extract_data(f(vload.(V, gep.(ptrargs, vadd(i, 3W)))...))
        if NonTemporal
            vstorent!(gep(ptry, i), v₁)
            vstorent!(gep(ptry, vadd(i, W)), v₂)
            vstorent!(gep(ptry, vadd(i, 2W)), v₃)
            vstorent!(gep(ptry, vadd(i, 3W)), v₄)
        else
            vnoaliasstore!(gep(ptry, i), v₁)
            vnoaliasstore!(gep(ptry, vadd(i, W)), v₂)
            vnoaliasstore!(gep(ptry, vadd(i, 2W)), v₃)
            vnoaliasstore!(gep(ptry, vadd(i, 3W)), v₄)
        end
    end
    # Serial clean-up starts right after the last complete 4W block and runs while one
    # full vector (W elements) still fits.
    ii = Niter << Wsh
    while ii < N - (W - 1)
        vᵢ = extract_data(f(vload.(V, gep.(ptrargs, ii))...))
        if NonTemporal
            vstorent!(gep(ptry, ii), vᵢ)
        else
            vnoaliasstore!(gep(ptry, ii), vᵢ)
        end
        ii = vadd(ii, W)
    end
    # Fewer than W elements remain: process them with a mask built from `N & (W - 1)`.
    if ii < N
        m = mask(T, N & (W - 1))
        vnoaliasstore!(gep(ptry, ii), extract_data(f(vload.(V, gep.(ptrargs, ii), m)...)), m)
    end
    return nothing
end
119
+
120
+
121
"""
    vmap!(f, destination, a::AbstractArray)
    vmap!(f, destination, a::AbstractArray, b::AbstractArray, ...)

Vectorized counterpart of `map!`: evaluate `f` on every element of `a` (or on the
corresponding elements of `a`, `b`, ...) and write the results into `destination`.
"""
function vmap!(f::F, y::AbstractVector{T}, args::Vararg{<:Any,A}) where {F,T,A}
    # `Val{false}` selects the regular (not non-temporal) store path of the serial kernel.
    nontemporal = Val{false}()
    return vmap_singlethread!(f, y, nontemporal, args...)
end
131
+
132
+
133
"""
    vmapt!(::Function, dest, args...)

Threaded version of `vmap!` (see `vmap!`): the work is split across threads with
`Threads.@threads`.
"""
function vmapt!(f::F, y::AbstractVector{T}, args::Vararg{<:Any,A}) where {F,T,A}
    # `Val{false}` selects the regular (not non-temporal) store path of the threaded kernel.
    nontemporal = Val{false}()
    return vmap_multithreaded!(f, y, nontemporal, args...)
end
141
+
142
+
56
143
"""
57
144
vmapnt!(::Function, dest, args...)
58
145
@@ -109,24 +196,7 @@ BenchmarkTools.Trial:
109
196
```
110
197
"""
111
198
function vmapnt!(f::F, y::AbstractVector{T}, args::Vararg{<:Any,A}) where {F,T,A}
    # `Val{true}` selects the non-temporal store path of the serial kernel.
    nontemporal = Val{true}()
    return vmap_singlethread!(f, y, nontemporal, args...)
end
131
201
132
202
"""
135
205
Like `vmapnt!` (see `vmapnt!`), but uses `Threads.@threads` for parallel execution.
136
206
"""
137
207
function vmapntt!(f::F, y::AbstractVector{T}, args::Vararg{<:Any,A}) where {F,T,A}
    # `Val{true}` selects the non-temporal store path of the threaded kernel.
    nontemporal = Val{true}()
    return vmap_multithreaded!(f, y, nontemporal, args...)
end
161
210
162
211
function vmap_call (f:: F , vm!:: V , args:: Vararg{<:Any,N} ) where {V,F,N}
@@ -174,6 +223,14 @@ and returning a new array.
174
223
"""
175
224
function vmap(f::F, args::Vararg{<:Any,N}) where {F,N}
    # Delegate to `vmap_call`, passing `vmap!` as the in-place mapping routine.
    return vmap_call(f, vmap!, args...)
end
176
225
226
"""
    vmapt(f, a::AbstractArray)
    vmapt(f, a::AbstractArray, b::AbstractArray, ...)

Threaded counterpart of [`vmap`](@ref).
"""
function vmapt(f::F, args::Vararg{<:Any,N}) where {F,N}
    # Delegate to `vmap_call`, passing `vmapt!` as the in-place mapping routine.
    return vmap_call(f, vmapt!, args...)
end
233
+
177
234
"""
178
235
vmapnt(f, a::AbstractArray)
179
236
vmapnt(f, a::AbstractArray, b::AbstractArray, ...)
0 commit comments