1
1
2
- const DenseNativeArray = DenseArray{<: NativeTypes }
3
-
4
2
"""
5
3
`vstorent!` (non-temporal store) requires data to be aligned.
6
4
`alignstores!` will align `y` in preparation for the non-temporal maps.
7
5
"""
8
6
function alignstores! (
9
- f:: F , y:: DenseArray {T} ,
10
- args:: Vararg{DenseNativeArray ,A}
7
+ f:: F , y:: AbstractArray {T} ,
8
+ args:: Vararg{AbstractArray ,A}
11
9
) where {F, T <: Base.HWReal , A}
12
10
N = length (y)
13
11
ptry = VectorizationBase. zstridedpointer (y)
@@ -32,9 +30,9 @@ function alignstores!(
32
30
end
33
31
34
32
function vmap_singlethread! (
35
- f:: F , y:: DenseArray {T} ,
33
+ f:: F , y:: AbstractArray {T} ,
36
34
:: Val{NonTemporal} ,
37
- args:: Vararg{DenseNativeArray ,A}
35
+ args:: Vararg{AbstractArray ,A}
38
36
) where {F,T <: Base.HWReal , A, NonTemporal}
39
37
if NonTemporal # if stores into `y` aren't aligned, we'll get a crash
40
38
ptry, ptrargs, N = alignstores! (f, y, args... )
@@ -60,36 +58,36 @@ function vmap_singlethread!(
60
58
end
61
59
i = vadd_fast (i, StaticInt {UNROLL} () * W)
62
60
end
63
- if Base. libllvm_version ≥ v " 11"
64
- Nm1 = vsub_fast (N, 1 )
65
- while i < N # stops at 16 when
66
- m = mask (V, i, Nm1)
67
- vnoaliasstore! (ptry, f (vload .(ptrargs, ((MM {W} (i),),), m)... ), (MM {W} (i,),), m)
68
- i = vadd_fast (i, W)
69
- end
70
- else
71
- while i < N - (W - 1 ) # stops at 16 when
72
- vᵣ = f (vload .(ptrargs, ((MM {W} (i),),))... )
73
- if NonTemporal
74
- vstorent! (ptry, vᵣ, (MM {W} (i),))
75
- else
76
- vnoaliasstore! (ptry, vᵣ, (MM {W} (i),))
77
- end
78
- i = vadd_fast (i, W)
79
- end
80
- if i < N
81
- m = mask (T, N & (W - 1 ))
82
- vnoaliasstore! (ptry, f (vload .(ptrargs, ((MM {W} (i),),), m)... ), (MM {W} (i,),), m)
61
+ # if Base.libllvm_version ≥ v"11" # this seems to be slower
62
+ # Nm1 = vsub_fast(N, 1)
63
+ # while i < N # stops at 16 when
64
+ # m = mask(V, i, Nm1)
65
+ # vnoaliasstore!(ptry, f(vload.(ptrargs, ((MM{W}(i),),), m)...), (MM{W}(i,),), m)
66
+ # i = vadd_fast(i, W)
67
+ # end
68
+ # else
69
+ while i < N - (W - 1 ) # stops at 16 when
70
+ vᵣ = f (vload .(ptrargs, ((MM {W} (i),),))... )
71
+ if NonTemporal
72
+ vstorent! (ptry, vᵣ, (MM {W} (i),))
73
+ else
74
+ vnoaliasstore! (ptry, vᵣ, (MM {W} (i),))
83
75
end
76
+ i = vadd_fast (i, W)
77
+ end
78
+ if i < N
79
+ m = mask (T, N & (W - 1 ))
80
+ vnoaliasstore! (ptry, f (vload .(ptrargs, ((MM {W} (i),),), m)... ), (MM {W} (i,),), m)
84
81
end
82
+ # end
85
83
y
86
84
end
87
85
88
86
function vmap_multithreaded! (
89
87
f:: F ,
90
- y:: DenseArray {T} ,
88
+ y:: AbstractArray {T} ,
91
89
:: Val{true} ,
92
- args:: Vararg{DenseNativeArray ,A}
90
+ args:: Vararg{AbstractArray ,A}
93
91
) where {F,T,A}
94
92
ptry, ptrargs, N = alignstores! (f, y, args... )
95
93
N > 0 || return y
@@ -114,9 +112,9 @@ function vmap_multithreaded!(
114
112
end
115
113
function vmap_multithreaded! (
116
114
f:: F ,
117
- y:: DenseArray {T} ,
115
+ y:: AbstractArray {T} ,
118
116
:: Val{false} ,
119
- args:: Vararg{DenseNativeArray ,A}
117
+ args:: Vararg{AbstractArray ,A}
120
118
) where {F,T,A}
121
119
N = length (y)
122
120
ptry = VectorizationBase. zstridedpointer (y)
@@ -142,6 +140,10 @@ function vmap_multithreaded!(
142
140
y
143
141
end
144
142
143
# Density checks used by the `vmap!` family to decide between the vectorized
# kernels and the generic `map!` fallback.

# Reduce the type parameter `D` of `ArrayInterface.DenseDims{D}` with `all`.
Base.@pure _all_dense(::ArrayInterface.DenseDims{D}) where {D} = all(D)

# No arrays to check: vacuously true.
@inline all_dense() = true

# One array: query its dense dims through ArrayInterface.
@inline all_dense(A::AbstractArray) = _all_dense(ArrayInterface.dense_dims(A))

# Two or more arrays: check the first, then recurse on the rest
# (`&&` short-circuits on the first array that is not all-dense).
@inline function all_dense(
    A::AbstractArray, B::AbstractArray, C::Vararg{AbstractArray,K}
) where {K}
    all_dense(A) && all_dense(B, C...)
end
145
147
146
148
"""
147
149
vmap!(f, destination, a::AbstractArray)
@@ -151,9 +153,13 @@ Vectorized-`map!`, applying `f` to each element of `a` (or paired elements of `a
151
153
and storing the result in `destination`.
152
154
"""
153
155
function vmap!(
    f::F, y::AbstractArray, args::Vararg{AbstractArray,A}
) where {F,A}
    # Take the vectorized path only when `check_args` accepts the arrays and
    # every one of them reports all-dense dimensions; anything else is handed
    # to the generic `map!`.
    if check_args(y, args...) && all_dense(y, args...)
        # `Val{false}()` fills the `NonTemporal` slot of `vmap_singlethread!`,
        # i.e. results are written with `vnoaliasstore!` rather than `vstorent!`.
        vmap_singlethread!(f, y, Val{false}(), args...)
    else
        map!(f, y, args...)
    end
end
158
164
159
165
163
169
Like `vmap!` (see `vmap!`), but uses `Threads.@threads` for parallel execution.
164
170
"""
165
171
function vmapt!(
    f::F, y::AbstractArray, args::Vararg{AbstractArray,A}
) where {F,A}
    # Take the threaded vectorized path only when `check_args` accepts the
    # arrays and every one of them reports all-dense dimensions.
    if check_args(y, args...) && all_dense(y, args...)
        # `Val{false}()` dispatches to the `::Val{false}` method of
        # `vmap_multithreaded!`.
        vmap_multithreaded!(f, y, Val{false}(), args...)
    else
        # Generic fallback: a plain `map!` call.
        map!(f, y, args...)
    end
end
170
180
171
181
@@ -225,9 +235,13 @@ BenchmarkTools.Trial:
225
235
```
226
236
"""
227
237
function vmapnt!(
    f::F, y::AbstractArray, args::Vararg{AbstractArray,A}
) where {F,A}
    # Take the vectorized path only when `check_args` accepts the arrays and
    # every one of them reports all-dense dimensions; anything else is handed
    # to the generic `map!`.
    if check_args(y, args...) && all_dense(y, args...)
        # `Val{true}()` fills the `NonTemporal` slot of `vmap_singlethread!`:
        # that method first calls `alignstores!` and then stores full vectors
        # with `vstorent!`.
        vmap_singlethread!(f, y, Val{true}(), args...)
    else
        map!(f, y, args...)
    end
end
232
246
233
247
"""
236
250
Like `vmapnt!` (see `vmapnt!`), but uses `Threads.@threads` for parallel execution.
237
251
"""
238
252
function vmapntt!(
    f::F, y::AbstractArray, args::Vararg{AbstractArray,A}
) where {F,A}
    # Take the threaded vectorized path only when `check_args` accepts the
    # arrays and every one of them reports all-dense dimensions.
    if check_args(y, args...) && all_dense(y, args...)
        # `Val{true}()` dispatches to the `::Val{true}` method of
        # `vmap_multithreaded!`, which starts by calling `alignstores!`.
        vmap_multithreaded!(f, y, Val{true}(), args...)
    else
        # Generic fallback: a plain `map!` call.
        map!(f, y, args...)
    end
end
243
261
244
262
# generic fallbacks
0 commit comments