Skip to content

Commit 39d2a1d

Browse files
committed
Temporary workaround for Bool-ean vectors yielding the wrong answer.
1 parent c21c174 commit 39d2a1d

File tree

2 files changed

+34
-34
lines changed

2 files changed

+34
-34
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "LoopVectorization"
22
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
33
authors = ["Chris Elrod <[email protected]>"]
4-
version = "0.8.24"
4+
version = "0.8.25"
55

66
[deps]
77
DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"

src/map.jl

Lines changed: 33 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@
55
"""
66
function alignstores!(
77
f::F, y::DenseArray{T},
8-
args::Vararg{<:DenseArray{<:NativeTypes},A}
9-
) where {F, T <: NativeTypes, A}
8+
args::Vararg{<:DenseArray{<:Base.HWReal},A}
9+
) where {F, T <: Base.HWReal, A}
1010
N = length(y)
1111
ptry = pointer(y)
1212
ptrargs = pointer.(args)
@@ -20,7 +20,7 @@ function alignstores!(
2020
if N < i
2121
m &= mask(T, N & (W - 1))
2222
end
23-
vnoaliasstore!(ptry, extract_data(f(vload.(V, ptrargs, m)...)), m)
23+
vnoaliasstore!(ptry, f(vload.(V, ptrargs, m)...), m)
2424
gep(ptry, i), gep.(ptrargs, i), N - i
2525
else
2626
ptry, ptrargs, N
@@ -30,8 +30,8 @@ end
3030
function vmap_singlethread!(
3131
f::F, y::DenseArray{T},
3232
::Val{NonTemporal},
33-
args::Vararg{<:DenseArray{<:NativeTypes},A}
34-
) where {F,T <: NativeTypes, A, NonTemporal}
33+
args::Vararg{<:DenseArray{<:Base.HWReal},A}
34+
) where {F,T <: Base.HWReal, A, NonTemporal}
3535
if NonTemporal
3636
ptry, ptrargs, N = alignstores!(f, y, args...)
3737
else
@@ -43,10 +43,10 @@ function vmap_singlethread!(
4343
W = VectorizationBase.pick_vector_width(T)
4444
V = VectorizationBase.pick_vector_width_val(T)
4545
while i < N - ((W << 2) - 1)
46-
v₁ = extract_data(f(vload.(V, gep.(ptrargs, i ))...))
47-
v₂ = extract_data(f(vload.(V, gep.(ptrargs, vadd(i, W)))...))
48-
v₃ = extract_data(f(vload.(V, gep.(ptrargs, vadd(i, 2W)))...))
49-
v₄ = extract_data(f(vload.(V, gep.(ptrargs, vadd(i, 3W)))...))
46+
v₁ = f(vload.(V, gep.(ptrargs, i ))...)
47+
v₂ = f(vload.(V, gep.(ptrargs, vadd(i, W)))...)
48+
v₃ = f(vload.(V, gep.(ptrargs, vadd(i, 2W)))...)
49+
v₄ = f(vload.(V, gep.(ptrargs, vadd(i, 3W)))...)
5050
if NonTemporal
5151
vstorent!(gep(ptry, i ), v₁)
5252
vstorent!(gep(ptry, vadd(i, W)), v₂)
@@ -61,7 +61,7 @@ function vmap_singlethread!(
6161
i = vadd(i, 4W)
6262
end
6363
while i < N - (W - 1) # stops at 16 when
64-
vᵢ = extract_data(f(vload.(V, gep.(ptrargs, i))...))
64+
vᵢ = f(vload.(V, gep.(ptrargs, i))...)
6565
if NonTemporal
6666
vstorent!(gep(ptry, i), vᵢ)
6767
else
@@ -71,7 +71,7 @@ function vmap_singlethread!(
7171
end
7272
if i < N
7373
m = mask(T, N & (W - 1))
74-
vnoaliasstore!(gep(ptry, i), extract_data(f(vload.(V, gep.(ptrargs, i), m)...)), m)
74+
vnoaliasstore!(gep(ptry, i), f(vload.(V, gep.(ptrargs, i), m)...), m)
7575
end
7676
y
7777
end
@@ -80,7 +80,7 @@ function vmap_multithreaded!(
8080
f::F,
8181
y::DenseArray{T},
8282
::Val{true},
83-
args::Vararg{<:DenseArray{<:NativeTypes},A}
83+
args::Vararg{<:DenseArray{<:Base.HWReal},A}
8484
) where {F,T,A}
8585
ptry, ptrargs, N = alignstores!(f, y, args...)
8686
N > 0 || return y
@@ -90,32 +90,32 @@ function vmap_multithreaded!(
9090
Niter = N >>> Wsh
9191
Base.Threads.@threads for j ∈ 0:Niter-1
9292
i = j << Wsh
93-
v₁ = extract_data(f(vload.(V, gep.(ptrargs, i ))...))
94-
v₂ = extract_data(f(vload.(V, gep.(ptrargs, vadd(i, W)))...))
95-
v₃ = extract_data(f(vload.(V, gep.(ptrargs, vadd(i, 2W)))...))
96-
v₄ = extract_data(f(vload.(V, gep.(ptrargs, vadd(i, 3W)))...))
93+
v₁ = f(vload.(V, gep.(ptrargs, i ))...)
94+
v₂ = f(vload.(V, gep.(ptrargs, vadd(i, W)))...)
95+
v₃ = f(vload.(V, gep.(ptrargs, vadd(i, 2W)))...)
96+
v₄ = f(vload.(V, gep.(ptrargs, vadd(i, 3W)))...)
9797
vstorent!(gep(ptry, i ), v₁)
9898
vstorent!(gep(ptry, vadd(i, W)), v₂)
9999
vstorent!(gep(ptry, vadd(i, 2W)), v₃)
100100
vstorent!(gep(ptry, vadd(i, 3W)), v₄)
101101
end
102102
ii = Niter << Wsh
103103
while ii < N - (W - 1) # stops at 16 when
104-
vᵢ = extract_data(f(vload.(V, gep.(ptrargs, ii))...))
104+
vᵢ = f(vload.(V, gep.(ptrargs, ii))...)
105105
vstorent!(gep(ptry, ii), vᵢ)
106106
ii = vadd(ii, W)
107107
end
108108
if ii < N
109109
m = mask(T, N & (W - 1))
110-
vnoaliasstore!(gep(ptry, ii), extract_data(f(vload.(V, gep.(ptrargs, ii), m)...)), m)
110+
vnoaliasstore!(gep(ptry, ii), f(vload.(V, gep.(ptrargs, ii), m)...), m)
111111
end
112112
y
113113
end
114114
function vmap_multithreaded!(
115115
f::F,
116116
y::DenseArray{T},
117117
::Val{false},
118-
args::Vararg{<:DenseArray{<:NativeTypes},A}
118+
args::Vararg{<:DenseArray{<:Base.HWReal},A}
119119
) where {F,T,A}
120120
N = length(y)
121121
ptry = pointer(y)
@@ -127,24 +127,24 @@ function vmap_multithreaded!(
127127
Niter = N >>> Wsh
128128
Base.Threads.@threads for j ∈ 0:Niter-1
129129
i = j << Wsh
130-
v₁ = extract_data(f(vload.(V, gep.(ptrargs, i ))...))
131-
v₂ = extract_data(f(vload.(V, gep.(ptrargs, vadd(i, W)))...))
132-
v₃ = extract_data(f(vload.(V, gep.(ptrargs, vadd(i, 2W)))...))
133-
v₄ = extract_data(f(vload.(V, gep.(ptrargs, vadd(i, 3W)))...))
130+
v₁ = f(vload.(V, gep.(ptrargs, i ))...)
131+
v₂ = f(vload.(V, gep.(ptrargs, vadd(i, W)))...)
132+
v₃ = f(vload.(V, gep.(ptrargs, vadd(i, 2W)))...)
133+
v₄ = f(vload.(V, gep.(ptrargs, vadd(i, 3W)))...)
134134
vnoaliasstore!(gep(ptry, i ), v₁)
135135
vnoaliasstore!(gep(ptry, vadd(i, W)), v₂)
136136
vnoaliasstore!(gep(ptry, vadd(i, 2W)), v₃)
137137
vnoaliasstore!(gep(ptry, vadd(i, 3W)), v₄)
138138
end
139139
ii = Niter << Wsh
140140
while ii < N - (W - 1) # stops at 16 when
141-
vᵢ = extract_data(f(vload.(V, gep.(ptrargs, ii))...))
141+
vᵢ = f(vload.(V, gep.(ptrargs, ii))...)
142142
vnoaliasstore!(gep(ptry, ii), vᵢ)
143143
ii = vadd(ii, W)
144144
end
145145
if ii < N
146146
m = mask(T, N & (W - 1))
147-
vnoaliasstore!(gep(ptry, ii), extract_data(f(vload.(V, gep.(ptrargs, ii), m)...)), m)
147+
vnoaliasstore!(gep(ptry, ii), f(vload.(V, gep.(ptrargs, ii), m)...), m)
148148
end
149149
y
150150
end
@@ -158,8 +158,8 @@ Vectorized-`map!`, applying `f` to each element of `a` (or paired elements of `a
158158
and storing the result in `destination`.
159159
"""
160160
function vmap!(
161-
f::F, y::DenseArray{T}, args::Vararg{<:DenseArray{<:NativeTypes},A}
162-
) where {F,T<:NativeTypes,A}
161+
f::F, y::DenseArray{T}, args::Vararg{<:DenseArray{<:Base.HWReal},A}
162+
) where {F,T<:Base.HWReal,A}
163163
vmap_singlethread!(f, y, Val{false}(), args...)
164164
end
165165

@@ -170,8 +170,8 @@ end
170170
Like `vmap!` (see `vmap!`), but uses `Threads.@threads` for parallel execution.
171171
"""
172172
function vmapt!(
173-
f::F, y::DenseArray{T}, args::Vararg{<:DenseArray{<:NativeTypes},A}
174-
) where {F,T<:NativeTypes,A}
173+
f::F, y::DenseArray{T}, args::Vararg{<:DenseArray{<:Base.HWReal},A}
174+
) where {F,T<:Base.HWReal,A}
175175
vmap_multithreaded!(f, y, Val{false}(), args...)
176176
end
177177

@@ -232,8 +232,8 @@ BenchmarkTools.Trial:
232232
```
233233
"""
234234
function vmapnt!(
235-
f::F, y::DenseArray{T}, args::Vararg{<:DenseArray{<:NativeTypes},A}
236-
) where {F,T<:NativeTypes,A}
235+
f::F, y::DenseArray{T}, args::Vararg{<:DenseArray{<:Base.HWReal},A}
236+
) where {F,T<:Base.HWReal,A}
237237
vmap_singlethread!(f, y, Val{true}(), args...)
238238
end
239239

@@ -243,8 +243,8 @@ end
243243
Like `vmapnt!` (see `vmapnt!`), but uses `Threads.@threads` for parallel execution.
244244
"""
245245
function vmapntt!(
246-
f::F, y::DenseArray{T}, args::Vararg{<:DenseArray{<:NativeTypes},A}
247-
) where {F,T<:NativeTypes,A}
246+
f::F, y::DenseArray{T}, args::Vararg{<:DenseArray{<:Base.HWReal},A}
247+
) where {F,T<:Base.HWReal,A}
248248
vmap_multithreaded!(f, y, Val{true}(), args...)
249249
end
250250

0 commit comments

Comments
 (0)