@@ -8,20 +8,22 @@ function alignstores!(
8
8
args:: Vararg{<:DenseArray{<:Base.HWReal},A}
9
9
) where {F, T <: Base.HWReal , A}
10
10
N = length (y)
11
- ptry = pointer (y)
12
- ptrargs = pointer .(args)
13
- W = VectorizationBase. pick_vector_width (T)
11
+ ptry = VectorizationBase. zero_offsets (stridedpointer (y))
12
+ ptrargs = VectorizationBase. zero_offsets .(stridedpointer .(args))
14
13
V = VectorizationBase. pick_vector_width_val (T)
15
- @assert iszero (reinterpret (UInt, ptry) & (sizeof (T) - 1 )) " The destination vector (`dest`) must be aligned at least to `sizeof(eltype(dest))`."
16
- alignment = reinterpret (UInt, ptry) & (VectorizationBase. REGISTER_SIZE - 1 )
14
+ W = unwrap (V)
15
+ zero_index = MM {W} (Static (0 ))
16
+ uintptry = reinterpret (UInt, pointer (ptry))
17
+ @assert iszero (uintptry & (sizeof (T) - 1 )) " The destination vector (`dest`) must be aligned at least to `sizeof(eltype(dest))`."
18
+ alignment = uintptry & (VectorizationBase. REGISTER_SIZE - 1 )
17
19
if alignment > 0
18
20
i = reinterpret (Int, W - (alignment >>> VectorizationBase. intlog2 (sizeof (T))))
19
21
m = mask (T, i)
20
22
if N < i
21
23
m &= mask (T, N & (W - 1 ))
22
24
end
23
- vnoaliasstore! (ptry, f (vload .(V, ptrargs, m)... ), m)
24
- gep (ptry, i), gep .(ptrargs, i ), N - i
25
+ vnoaliasstore! (ptry, f (vload .(ptrargs, ((zero_index,),), m)... ), (zero_index, ), m)
26
+ gesp (ptry, (i,)), gesp .(ptrargs, ((i,),) ), N - i
25
27
else
26
28
ptry, ptrargs, N
27
29
end
@@ -32,46 +34,44 @@ function vmap_singlethread!(
32
34
:: Val{NonTemporal} ,
33
35
args:: Vararg{<:DenseArray{<:Base.HWReal},A}
34
36
) where {F,T <: Base.HWReal , A, NonTemporal}
35
- if NonTemporal
37
+ if NonTemporal # if stores into `y` aren't aligned, we'll get a crash
36
38
ptry, ptrargs, N = alignstores! (f, y, args... )
37
39
else
38
40
N = length (y)
39
- ptry = pointer (y )
40
- ptrargs = pointer .( args)
41
+ ptry = VectorizationBase . zero_offsets ( stridedpointer (y) )
42
+ ptrargs = VectorizationBase . zero_offsets .( stridedpointer .( args) )
41
43
end
42
44
i = 0
43
- W = VectorizationBase. pick_vector_width (T)
44
45
V = VectorizationBase. pick_vector_width_val (T)
46
+ W = unwrap (V)
47
+ st = VectorizationBase. static_sizeof (T)
48
+ zero_index = MM {W} (Static (0 ), st)
45
49
while i < N - ((W << 2 ) - 1 )
46
- v₁ = f (vload .(V, gep .(ptrargs, i ))... )
47
- v₂ = f (vload .(V, gep .(ptrargs, vadd (i, W)))... )
48
- v₃ = f (vload .(V, gep .(ptrargs, vadd (i, 2 W)))... )
49
- v₄ = f (vload .(V, gep .(ptrargs, vadd (i, 3 W)))... )
50
+
51
+ # vstore!(stridedpointer(B), VectorizationBase.VecUnroll((v1,v2,v3)), VectorizationBase.Unroll{AU,1,3,AV,W64,zero(UInt)}((i, j, k)))
52
+ # vload(stridedpointer(B), VectorizationBase.Unroll{1,1,4,1,W,0x0000000000000000}((i,)))
53
+
54
+ index = VectorizationBase. Unroll {1,1,4,1,W,0x0000000000000000} ((i,))
55
+ v = f (vload .(ptrargs, index)... )
50
56
if NonTemporal
51
- vstorent! (gep (ptry, i ), v₁)
52
- vstorent! (gep (ptry, vadd (i, W)), v₂)
53
- vstorent! (gep (ptry, vadd (i, 2 W)), v₃)
54
- vstorent! (gep (ptry, vadd (i, 3 W)), v₄)
57
+ vstorent! (ptry, v, index)
55
58
else
56
- vnoaliasstore! (gep (ptry, i ), v₁)
57
- vnoaliasstore! (gep (ptry, vadd (i, W)), v₂)
58
- vnoaliasstore! (gep (ptry, vadd (i, 2 W)), v₃)
59
- vnoaliasstore! (gep (ptry, vadd (i, 3 W)), v₄)
59
+ vnoaliasstore! (ptry, v, index)
60
60
end
61
61
i = vadd (i, 4 W)
62
62
end
63
63
while i < N - (W - 1 ) # stops at 16 when
64
- vᵢ = f (vload .(V, gep .( ptrargs, i ))... )
64
+ vᵣ = f (vload .(ptrargs, (( MM {W} (i),), ))... )
65
65
if NonTemporal
66
- vstorent! (gep ( ptry, i), vᵢ )
66
+ vstorent! (ptry, vᵣ, ( MM {W} ( i),) )
67
67
else
68
- vnoaliasstore! (gep ( ptry, i), vᵢ )
68
+ vnoaliasstore! (ptry, vᵣ, ( MM {W} ( i),) )
69
69
end
70
70
i = vadd (i, W)
71
71
end
72
72
if i < N
73
73
m = mask (T, N & (W - 1 ))
74
- vnoaliasstore! (gep ( ptry, i), f (vload .(V, gep .( ptrargs, i), m)... ), m)
74
+ vnoaliasstore! (ptry, f (vload .(ptrargs, (( MM {W} ( i),),), m)... ), ( MM {W} (i,), ), m)
75
75
end
76
76
y
77
77
end
@@ -89,25 +89,17 @@ function vmap_multithreaded!(
89
89
Wsh = Wshift + 2
90
90
Niter = N >>> Wsh
91
91
Base. Threads. @threads for j ∈ 0 : Niter- 1
92
- i = j << Wsh
93
- v₁ = f (vload .(V, gep .(ptrargs, i ))... )
94
- v₂ = f (vload .(V, gep .(ptrargs, vadd (i, W)))... )
95
- v₃ = f (vload .(V, gep .(ptrargs, vadd (i, 2 W)))... )
96
- v₄ = f (vload .(V, gep .(ptrargs, vadd (i, 3 W)))... )
97
- vstorent! (gep (ptry, i ), v₁)
98
- vstorent! (gep (ptry, vadd (i, W)), v₂)
99
- vstorent! (gep (ptry, vadd (i, 2 W)), v₃)
100
- vstorent! (gep (ptry, vadd (i, 3 W)), v₄)
92
+ index = VectorizationBase. Unroll {1,1,4,1,W,0x0000000000000000} ((j << Wsh,))
93
+ vstorent! (ptry, f (vload .(ptrargs, index)... ), index)
101
94
end
102
95
ii = Niter << Wsh
103
96
while ii < N - (W - 1 ) # stops at 16 when
104
- vᵢ = f (vload .(V, gep .(ptrargs, ii))... )
105
- vstorent! (gep (ptry, ii), vᵢ)
97
+ vstorent! (ptry, f (vload .(ptrargs, ((MM {W} (ii),),))... ), (MM {W} (ii),))
106
98
ii = vadd (ii, W)
107
99
end
108
100
if ii < N
109
101
m = mask (T, N & (W - 1 ))
110
- vnoaliasstore! (gep ( ptry, ii), f (vload .(V, gep .( ptrargs, ii), m)... ), m)
102
+ vnoaliasstore! (ptry, f (vload .(ptrargs, (( MM {W} ( ii),),), m)... ), ( MM {W} (ii), ), m)
111
103
end
112
104
y
113
105
end
@@ -118,33 +110,25 @@ function vmap_multithreaded!(
118
110
args:: Vararg{<:DenseArray{<:Base.HWReal},A}
119
111
) where {F,T,A}
120
112
N = length (y)
121
- ptry = pointer (y )
122
- ptrargs = pointer .( args)
113
+ ptry = VectorizationBase . zero_offsets ( stridedpointer (y) )
114
+ ptrargs = VectorizationBase . zero_offsets .( stridedpointer .( args) )
123
115
N > 0 || return y
124
116
W, Wshift = VectorizationBase. pick_vector_width_shift (T)
125
117
V = VectorizationBase. pick_vector_width_val (T)
126
118
Wsh = Wshift + 2
127
119
Niter = N >>> Wsh
128
120
Base. Threads. @threads for j ∈ 0 : Niter- 1
129
- i = j << Wsh
130
- v₁ = f (vload .(V, gep .(ptrargs, i ))... )
131
- v₂ = f (vload .(V, gep .(ptrargs, vadd (i, W)))... )
132
- v₃ = f (vload .(V, gep .(ptrargs, vadd (i, 2 W)))... )
133
- v₄ = f (vload .(V, gep .(ptrargs, vadd (i, 3 W)))... )
134
- vnoaliasstore! (gep (ptry, i ), v₁)
135
- vnoaliasstore! (gep (ptry, vadd (i, W)), v₂)
136
- vnoaliasstore! (gep (ptry, vadd (i, 2 W)), v₃)
137
- vnoaliasstore! (gep (ptry, vadd (i, 3 W)), v₄)
121
+ index = VectorizationBase. Unroll {1,1,4,1,W,0x0000000000000000} ((j << Wsh,))
122
+ vnoaliasstore! (ptry, f (vload .(ptrargs, index)... ), index)
138
123
end
139
124
ii = Niter << Wsh
140
125
while ii < N - (W - 1 ) # stops at 16 when
141
- vᵢ = f (vload .(V, gep .(ptrargs, ii))... )
142
- vnoaliasstore! (gep (ptry, ii), vᵢ)
126
+ vnoaliasstore! (ptry, f (vload .(ptrargs, ((MM {W} (ii),),))... ), (MM {W} (ii),))
143
127
ii = vadd (ii, W)
144
128
end
145
129
if ii < N
146
130
m = mask (T, N & (W - 1 ))
147
- vnoaliasstore! (gep ( ptry, ii), f (vload .(V, gep .( ptrargs, ii), m)... ), m)
131
+ vnoaliasstore! (ptry, f (vload .(ptrargs, (( MM {W} ( ii),),), m)... ), ( MM {W} (ii), ), m)
148
132
end
149
133
y
150
134
end
0 commit comments