5
5
"""
6
6
function alignstores! (
7
7
f:: F , y:: DenseArray{T} ,
8
- args:: Vararg{<:DenseArray{<:NativeTypes },A}
9
- ) where {F, T <: NativeTypes , A}
8
+ args:: Vararg{<:DenseArray{<:Base.HWReal },A}
9
+ ) where {F, T <: Base.HWReal , A}
10
10
N = length (y)
11
11
ptry = pointer (y)
12
12
ptrargs = pointer .(args)
@@ -20,7 +20,7 @@ function alignstores!(
20
20
if N < i
21
21
m &= mask (T, N & (W - 1 ))
22
22
end
23
- vnoaliasstore! (ptry, extract_data ( f (vload .(V, ptrargs, m)... ) ), m)
23
+ vnoaliasstore! (ptry, f (vload .(V, ptrargs, m)... ), m)
24
24
gep (ptry, i), gep .(ptrargs, i), N - i
25
25
else
26
26
ptry, ptrargs, N
30
30
function vmap_singlethread! (
31
31
f:: F , y:: DenseArray{T} ,
32
32
:: Val{NonTemporal} ,
33
- args:: Vararg{<:DenseArray{<:NativeTypes },A}
34
- ) where {F,T <: NativeTypes , A, NonTemporal}
33
+ args:: Vararg{<:DenseArray{<:Base.HWReal },A}
34
+ ) where {F,T <: Base.HWReal , A, NonTemporal}
35
35
if NonTemporal
36
36
ptry, ptrargs, N = alignstores! (f, y, args... )
37
37
else
@@ -43,10 +43,10 @@ function vmap_singlethread!(
43
43
W = VectorizationBase. pick_vector_width (T)
44
44
V = VectorizationBase. pick_vector_width_val (T)
45
45
while i < N - ((W << 2 ) - 1 )
46
- v₁ = extract_data ( f (vload .(V, gep .(ptrargs, i ))... ) )
47
- v₂ = extract_data ( f (vload .(V, gep .(ptrargs, vadd (i, W)))... ) )
48
- v₃ = extract_data ( f (vload .(V, gep .(ptrargs, vadd (i, 2 W)))... ) )
49
- v₄ = extract_data ( f (vload .(V, gep .(ptrargs, vadd (i, 3 W)))... ) )
46
+ v₁ = f (vload .(V, gep .(ptrargs, i ))... )
47
+ v₂ = f (vload .(V, gep .(ptrargs, vadd (i, W)))... )
48
+ v₃ = f (vload .(V, gep .(ptrargs, vadd (i, 2 W)))... )
49
+ v₄ = f (vload .(V, gep .(ptrargs, vadd (i, 3 W)))... )
50
50
if NonTemporal
51
51
vstorent! (gep (ptry, i ), v₁)
52
52
vstorent! (gep (ptry, vadd (i, W)), v₂)
@@ -61,7 +61,7 @@ function vmap_singlethread!(
61
61
i = vadd (i, 4 W)
62
62
end
63
63
while i < N - (W - 1 ) # stops at 16 when
64
- vᵢ = extract_data ( f (vload .(V, gep .(ptrargs, i))... ) )
64
+ vᵢ = f (vload .(V, gep .(ptrargs, i))... )
65
65
if NonTemporal
66
66
vstorent! (gep (ptry, i), vᵢ)
67
67
else
@@ -71,7 +71,7 @@ function vmap_singlethread!(
71
71
end
72
72
if i < N
73
73
m = mask (T, N & (W - 1 ))
74
- vnoaliasstore! (gep (ptry, i), extract_data ( f (vload .(V, gep .(ptrargs, i), m)... ) ), m)
74
+ vnoaliasstore! (gep (ptry, i), f (vload .(V, gep .(ptrargs, i), m)... ), m)
75
75
end
76
76
y
77
77
end
@@ -80,7 +80,7 @@ function vmap_multithreaded!(
80
80
f:: F ,
81
81
y:: DenseArray{T} ,
82
82
:: Val{true} ,
83
- args:: Vararg{<:DenseArray{<:NativeTypes },A}
83
+ args:: Vararg{<:DenseArray{<:Base.HWReal },A}
84
84
) where {F,T,A}
85
85
ptry, ptrargs, N = alignstores! (f, y, args... )
86
86
N > 0 || return y
@@ -90,32 +90,32 @@ function vmap_multithreaded!(
90
90
Niter = N >>> Wsh
91
91
Base. Threads. @threads for j ∈ 0 : Niter- 1
92
92
i = j << Wsh
93
- v₁ = extract_data ( f (vload .(V, gep .(ptrargs, i ))... ) )
94
- v₂ = extract_data ( f (vload .(V, gep .(ptrargs, vadd (i, W)))... ) )
95
- v₃ = extract_data ( f (vload .(V, gep .(ptrargs, vadd (i, 2 W)))... ) )
96
- v₄ = extract_data ( f (vload .(V, gep .(ptrargs, vadd (i, 3 W)))... ) )
93
+ v₁ = f (vload .(V, gep .(ptrargs, i ))... )
94
+ v₂ = f (vload .(V, gep .(ptrargs, vadd (i, W)))... )
95
+ v₃ = f (vload .(V, gep .(ptrargs, vadd (i, 2 W)))... )
96
+ v₄ = f (vload .(V, gep .(ptrargs, vadd (i, 3 W)))... )
97
97
vstorent! (gep (ptry, i ), v₁)
98
98
vstorent! (gep (ptry, vadd (i, W)), v₂)
99
99
vstorent! (gep (ptry, vadd (i, 2 W)), v₃)
100
100
vstorent! (gep (ptry, vadd (i, 3 W)), v₄)
101
101
end
102
102
ii = Niter << Wsh
103
103
while ii < N - (W - 1 ) # stops at 16 when
104
- vᵢ = extract_data ( f (vload .(V, gep .(ptrargs, ii))... ) )
104
+ vᵢ = f (vload .(V, gep .(ptrargs, ii))... )
105
105
vstorent! (gep (ptry, ii), vᵢ)
106
106
ii = vadd (ii, W)
107
107
end
108
108
if ii < N
109
109
m = mask (T, N & (W - 1 ))
110
- vnoaliasstore! (gep (ptry, ii), extract_data ( f (vload .(V, gep .(ptrargs, ii), m)... ) ), m)
110
+ vnoaliasstore! (gep (ptry, ii), f (vload .(V, gep .(ptrargs, ii), m)... ), m)
111
111
end
112
112
y
113
113
end
114
114
function vmap_multithreaded! (
115
115
f:: F ,
116
116
y:: DenseArray{T} ,
117
117
:: Val{false} ,
118
- args:: Vararg{<:DenseArray{<:NativeTypes },A}
118
+ args:: Vararg{<:DenseArray{<:Base.HWReal },A}
119
119
) where {F,T,A}
120
120
N = length (y)
121
121
ptry = pointer (y)
@@ -127,24 +127,24 @@ function vmap_multithreaded!(
127
127
Niter = N >>> Wsh
128
128
Base. Threads. @threads for j ∈ 0 : Niter- 1
129
129
i = j << Wsh
130
- v₁ = extract_data ( f (vload .(V, gep .(ptrargs, i ))... ) )
131
- v₂ = extract_data ( f (vload .(V, gep .(ptrargs, vadd (i, W)))... ) )
132
- v₃ = extract_data ( f (vload .(V, gep .(ptrargs, vadd (i, 2 W)))... ) )
133
- v₄ = extract_data ( f (vload .(V, gep .(ptrargs, vadd (i, 3 W)))... ) )
130
+ v₁ = f (vload .(V, gep .(ptrargs, i ))... )
131
+ v₂ = f (vload .(V, gep .(ptrargs, vadd (i, W)))... )
132
+ v₃ = f (vload .(V, gep .(ptrargs, vadd (i, 2 W)))... )
133
+ v₄ = f (vload .(V, gep .(ptrargs, vadd (i, 3 W)))... )
134
134
vnoaliasstore! (gep (ptry, i ), v₁)
135
135
vnoaliasstore! (gep (ptry, vadd (i, W)), v₂)
136
136
vnoaliasstore! (gep (ptry, vadd (i, 2 W)), v₃)
137
137
vnoaliasstore! (gep (ptry, vadd (i, 3 W)), v₄)
138
138
end
139
139
ii = Niter << Wsh
140
140
while ii < N - (W - 1 ) # stops at 16 when
141
- vᵢ = extract_data ( f (vload .(V, gep .(ptrargs, ii))... ) )
141
+ vᵢ = f (vload .(V, gep .(ptrargs, ii))... )
142
142
vnoaliasstore! (gep (ptry, ii), vᵢ)
143
143
ii = vadd (ii, W)
144
144
end
145
145
if ii < N
146
146
m = mask (T, N & (W - 1 ))
147
- vnoaliasstore! (gep (ptry, ii), extract_data ( f (vload .(V, gep .(ptrargs, ii), m)... ) ), m)
147
+ vnoaliasstore! (gep (ptry, ii), f (vload .(V, gep .(ptrargs, ii), m)... ), m)
148
148
end
149
149
y
150
150
end
@@ -158,8 +158,8 @@ Vectorized-`map!`, applying `f` to each element of `a` (or paired elements of `a
158
158
and storing the result in `destination`.
159
159
"""
160
160
function vmap!(
    f::F, y::DenseArray{T}, args::Vararg{<:DenseArray{<:Base.HWReal},A}
) where {F,T<:Base.HWReal,A}
    # Single-threaded entry point. `Val{false}()` binds the `NonTemporal` type parameter of
    # `vmap_singlethread!` to `false`, so its `if NonTemporal` branches are not taken.
    vmap_singlethread!(f, y, Val{false}(), args...)
end
165
165
170
170
Like `vmap!` (see `vmap!`), but uses `Threads.@threads` for parallel execution.
171
171
"""
172
172
function vmapt!(
    f::F, y::DenseArray{T}, args::Vararg{<:DenseArray{<:Base.HWReal},A}
) where {F,T<:Base.HWReal,A}
    # Threaded entry point. `Val{false}()` dispatches to the `vmap_multithreaded!` method
    # that writes its results with `vnoaliasstore!` inside a `Threads.@threads` loop.
    vmap_multithreaded!(f, y, Val{false}(), args...)
end
177
177
@@ -232,8 +232,8 @@ BenchmarkTools.Trial:
232
232
```
233
233
"""
234
234
function vmapnt!(
    f::F, y::DenseArray{T}, args::Vararg{<:DenseArray{<:Base.HWReal},A}
) where {F,T<:Base.HWReal,A}
    # Single-threaded entry point. `Val{true}()` binds the `NonTemporal` type parameter of
    # `vmap_singlethread!` to `true`, selecting its `alignstores!` / `vstorent!` branches.
    vmap_singlethread!(f, y, Val{true}(), args...)
end
239
239
243
243
Like `vmapnt!` (see `vmapnt!`), but uses `Threads.@threads` for parallel execution.
244
244
"""
245
245
function vmapntt!(
    f::F, y::DenseArray{T}, args::Vararg{<:DenseArray{<:Base.HWReal},A}
) where {F,T<:Base.HWReal,A}
    # Threaded entry point. `Val{true}()` dispatches to the `vmap_multithreaded!` method
    # that first calls `alignstores!` and then writes with `vstorent!` inside a
    # `Threads.@threads` loop.
    vmap_multithreaded!(f, y, Val{true}(), args...)
end
250
250
0 commit comments