@@ -63,50 +63,50 @@ function vmap_singlethread!(
63
63
start, N, :: Val{NonTemporal} ,
64
64
ptrargs:: Tuple{Vararg{AbstractStridedPointer,A}}
65
65
) where {F, T, NonTemporal, A}
66
- i = convert (Int, start)
67
- V = VectorizationBase. pick_vector_width (T)
68
- W = unwrap (V)
69
- st = VectorizationBase. static_sizeof (T)
70
- UNROLL = 4
71
- LOG2UNROLL = 2
72
- while i < vsub_nsw (N, ((W << LOG2UNROLL) - 1 ))
73
- index = VectorizationBase. Unroll {1,W,UNROLL,1,W,0x0000000000000000} ((i,))
74
- v = f (VectorizationBase. fmap (vload, ptrargs, index)... )
75
- if NonTemporal
76
- _vstore! (ptry, v, index, True (), True (), True (), register_size ())
77
- else
78
- _vstore! (ptry, v, index, False (), True (), False (), register_size ())
79
- end
80
- i = vadd_nw (i, StaticInt {UNROLL} () * W)
66
+ i = convert (Int, start)
67
+ V = VectorizationBase. pick_vector_width (promote_type (T, reduce (promote_type, map (eltype, ptrargs))))
68
+ W = unwrap (V)
69
+ st = VectorizationBase. static_sizeof (T)
70
+ UNROLL = 4
71
+ LOG2UNROLL = 2
72
+ while i < vsub_nsw (N, ((W << LOG2UNROLL) - 1 ))
73
+ index = VectorizationBase. Unroll {1,W,UNROLL,1,W,0x0000000000000000} ((i,))
74
+ v = f (VectorizationBase. fmap (vload, ptrargs, index)... )
75
+ if NonTemporal
76
+ _vstore! (ptry, v, index, True (), True (), True (), register_size ())
77
+ else
78
+ _vstore! (ptry, v, index, False (), True (), False (), register_size ())
81
79
end
82
- # if Base.libllvm_version ≥ v"11" # this seems to be slower
83
- # Nm1 = vsub_nw(N, 1)
84
- # while i < N # stops at 16 when
85
- # m = mask(V, i, Nm1 )
86
- # vnoaliasstore!(ptry, f(vload.(ptrargs, ((MM{W}(i),),), m)...), (MM{W}(i,),), m)
87
- # i = vadd_nw( i, W )
88
- # end
89
- # else
90
- while i < vsub_nsw (N, (W - 1 )) # stops at 16 when
91
- vᵣ = f ( map1 (vload, ptrargs, ( MM {W} (i),)) ... )
92
- if NonTemporal
93
- _vstore! (ptry, vᵣ , (MM {W} (i),), True (), True (), True (), register_size () )
94
- else
95
- _vstore! (ptry, vᵣ, (MM {W} (i),), False (), True (), False (), register_size ())
96
- end
97
- i = vadd_nw (i, W )
80
+ i = vadd_nw (i, StaticInt {UNROLL} () * W)
81
+ end
82
+ # if Base.libllvm_version ≥ v"11" # this seems to be slower
83
+ # Nm1 = vsub_nw(N, 1 )
84
+ # while i < N # stops at 16 when
85
+ # m = mask(V, i, Nm1 )
86
+ # vnoaliasstore!(ptry, f(vload.(ptrargs, ((MM{W}(i),),), m)...), (MM{W}(i,),), m)
87
+ # i = vadd_nw(i, W)
88
+ # end
89
+ # else
90
+ while i < vsub_nsw (N, (W - 1 )) # stops at 16 when
91
+ vᵣ = f ( map1 (vload, ptrargs , (MM {W} (i),)) ... )
92
+ if NonTemporal
93
+ _vstore! (ptry, vᵣ, (MM {W} (i),), True (), True (), True (), register_size ())
94
+ else
95
+ _vstore! (ptry, vᵣ, ( MM {W} (i),), False (), True (), False (), register_size () )
98
96
end
99
- if i < N
100
- m = mask (T, N & (W - 1 ))
101
- vfinal = f (map1 (vload, ptrargs, (MM {W} (i),), m)... )
102
- if NonTemporal
103
- _vstore! (ptry, vfinal, (MM {W} (i,),), m, True (), True (), False (), register_size ())
104
- else
105
- _vstore! (ptry, vfinal, (MM {W} (i,),), m, False (), True (), False (), register_size ())
106
- end
97
+ i = vadd_nw (i, W)
98
+ end
99
+ if i < N
100
+ m = mask (T, N & (W - 1 ))
101
+ vfinal = f (map1 (vload, ptrargs, (MM {W} (i),), m)... )
102
+ if NonTemporal
103
+ _vstore! (ptry, vfinal, (MM {W} (i,),), m, True (), True (), False (), register_size ())
104
+ else
105
+ _vstore! (ptry, vfinal, (MM {W} (i,),), m, False (), True (), False (), register_size ())
107
106
end
108
- # end
109
- nothing
107
+ end
108
+ # end
109
+ nothing
110
110
end
111
111
112
112
"""
    AbstractVmapClosure{NonTemporal,F,D,N,A} <: Function

Abstract supertype for vmap closure objects. Because it subtypes `Function`,
concrete subtypes are treated as function-like values.

# Type parameters
- `NonTemporal`: presumably the flag selecting non-temporal stores, as in the
  `Val{NonTemporal}` argument of `vmap_singlethread!` (confirm against subtypes).
- `F`, `D`: unconstrained here; their meaning is fixed by the concrete subtypes.
- `N`: the number of pointers held in `A`.
- `A`: a tuple of exactly `N` `StridedPointer`s.
"""
abstract type AbstractVmapClosure{
    NonTemporal,
    F,
    D,
    N,
    A<:Tuple{Vararg{StridedPointer,N}},
} <: Function end
0 commit comments