Skip to content

Commit eb50656

Browse files
committed
Also use arg pointers for computing vmap vector width
1 parent 46d9fb5 commit eb50656

File tree

1 file changed

+41
-41
lines changed

1 file changed

+41
-41
lines changed

src/simdfunctionals/map.jl

Lines changed: 41 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -63,50 +63,50 @@ function vmap_singlethread!(
6363
start, N, ::Val{NonTemporal},
6464
ptrargs::Tuple{Vararg{AbstractStridedPointer,A}}
6565
) where {F, T, NonTemporal, A}
66-
i = convert(Int, start)
67-
V = VectorizationBase.pick_vector_width(T)
68-
W = unwrap(V)
69-
st = VectorizationBase.static_sizeof(T)
70-
UNROLL = 4
71-
LOG2UNROLL = 2
72-
while i < vsub_nsw(N, ((W << LOG2UNROLL) - 1))
73-
index = VectorizationBase.Unroll{1,W,UNROLL,1,W,0x0000000000000000}((i,))
74-
v = f(VectorizationBase.fmap(vload, ptrargs, index)...)
75-
if NonTemporal
76-
_vstore!(ptry, v, index, True(), True(), True(), register_size())
77-
else
78-
_vstore!(ptry, v, index, False(), True(), False(), register_size())
79-
end
80-
i = vadd_nw(i, StaticInt{UNROLL}() * W)
66+
i = convert(Int, start)
67+
V = VectorizationBase.pick_vector_width(promote_type(T, reduce(promote_type, map(eltype, ptrargs))))
68+
W = unwrap(V)
69+
st = VectorizationBase.static_sizeof(T)
70+
UNROLL = 4
71+
LOG2UNROLL = 2
72+
while i < vsub_nsw(N, ((W << LOG2UNROLL) - 1))
73+
index = VectorizationBase.Unroll{1,W,UNROLL,1,W,0x0000000000000000}((i,))
74+
v = f(VectorizationBase.fmap(vload, ptrargs, index)...)
75+
if NonTemporal
76+
_vstore!(ptry, v, index, True(), True(), True(), register_size())
77+
else
78+
_vstore!(ptry, v, index, False(), True(), False(), register_size())
8179
end
82-
# if Base.libllvm_version ≥ v"11" # this seems to be slower
83-
# Nm1 = vsub_nw(N, 1)
84-
# while i < N # stops at 16 when
85-
# m = mask(V, i, Nm1)
86-
# vnoaliasstore!(ptry, f(vload.(ptrargs, ((MM{W}(i),),), m)...), (MM{W}(i,),), m)
87-
# i = vadd_nw(i, W)
88-
# end
89-
# else
90-
while i < vsub_nsw(N, (W - 1)) # stops at 16 when
91-
vᵣ = f(map1(vload, ptrargs, (MM{W}(i),))...)
92-
if NonTemporal
93-
_vstore!(ptry, vᵣ, (MM{W}(i),), True(), True(), True(), register_size())
94-
else
95-
_vstore!(ptry, vᵣ, (MM{W}(i),), False(), True(), False(), register_size())
96-
end
97-
i = vadd_nw(i, W)
80+
i = vadd_nw(i, StaticInt{UNROLL}() * W)
81+
end
82+
# if Base.libllvm_version ≥ v"11" # this seems to be slower
83+
# Nm1 = vsub_nw(N, 1)
84+
# while i < N # stops at 16 when
85+
# m = mask(V, i, Nm1)
86+
# vnoaliasstore!(ptry, f(vload.(ptrargs, ((MM{W}(i),),), m)...), (MM{W}(i,),), m)
87+
# i = vadd_nw(i, W)
88+
# end
89+
# else
90+
while i < vsub_nsw(N, (W - 1)) # stops at 16 when
91+
vᵣ = f(map1(vload, ptrargs, (MM{W}(i),))...)
92+
if NonTemporal
93+
_vstore!(ptry, vᵣ, (MM{W}(i),), True(), True(), True(), register_size())
94+
else
95+
_vstore!(ptry, vᵣ, (MM{W}(i),), False(), True(), False(), register_size())
9896
end
99-
if i < N
100-
m = mask(T, N & (W - 1))
101-
vfinal = f(map1(vload, ptrargs, (MM{W}(i),), m)...)
102-
if NonTemporal
103-
_vstore!(ptry, vfinal, (MM{W}(i,),), m, True(), True(), False(), register_size())
104-
else
105-
_vstore!(ptry, vfinal, (MM{W}(i,),), m, False(), True(), False(), register_size())
106-
end
97+
i = vadd_nw(i, W)
98+
end
99+
if i < N
100+
m = mask(T, N & (W - 1))
101+
vfinal = f(map1(vload, ptrargs, (MM{W}(i),), m)...)
102+
if NonTemporal
103+
_vstore!(ptry, vfinal, (MM{W}(i,),), m, True(), True(), False(), register_size())
104+
else
105+
_vstore!(ptry, vfinal, (MM{W}(i,),), m, False(), True(), False(), register_size())
107106
end
108-
# end
109-
nothing
107+
end
108+
# end
109+
nothing
110110
end
111111

112112
# Abstract supertype, itself a subtype of `Function`, for the `vmap` closure types.
# Type parameters: `NonTemporal`, `F`, `D`, `N`, and `A`, where `A` is constrained
# to a tuple type of exactly `N` `StridedPointer`s.
# NOTE(review): `NonTemporal` presumably mirrors the `Val{NonTemporal}` store flag
# taken by `vmap_singlethread!`, and `F` presumably is the mapped function's type;
# the role of `D` is not visible here — confirm against the concrete subtypes.
abstract type AbstractVmapClosure{NonTemporal,F,D,N,A<:Tuple{Vararg{StridedPointer,N}}} <: Function end

0 commit comments

Comments
 (0)