1
1
using LoopVectorization
2
2
using LinearAlgebra
3
+ using OffsetArrays
3
4
using Test
4
5
6
+ if ! isdefined (@__MODULE__ , Symbol (" @_avx" ))
7
+ include (" testsetup.jl" )
8
+ end
9
+
5
10
@testset " Miscellaneous" begin
6
11
# T = Float32
7
12
# Unum, Tnum = LoopVectorization.register_count() == 16 ? (1, 6) : (1, 8)
@@ -48,7 +53,7 @@ using Test
48
53
s += t * y[n]
49
54
end );
50
55
ls = LoopVectorization. loopset (q);
51
-
56
+
52
57
function dot3avx24 (x, A, y)
53
58
M, N = size (A)
54
59
s = zero (promote_type (eltype (x), eltype (A), eltype (y)))
@@ -157,7 +162,7 @@ using Test
157
162
# @test LoopVectorization.choose_order(lsvar) == (Symbol[:j,:i], :j, Symbol("##undefined##"), :j, 8, -1)
158
163
# @test LoopVectorization.choose_order(lsvar) == (Symbol[:j,:i], :j, :i, :j, 2, 6)
159
164
# end
160
-
165
+
161
166
function myvar! (s², A, x̄)
162
167
@. s² = 0
163
168
@inbounds for i ∈ 1 : size (A,2 )
@@ -197,7 +202,7 @@ using Test
197
202
Z[i, j] = acc
198
203
end
199
204
end
200
- function setcolumstovectorplus100avx! (Z:: AbstractArray{T} , A) where {T}
205
+ function setcolumstovectorplus100avx! (Z:: AbstractArray{T} , A) where {T}
201
206
@turbo for i = axes (A,1 ), j = axes (Z,2 )
202
207
acc = zero (T)
203
208
acc = acc + A[i] + (26 + 74 )
@@ -508,7 +513,7 @@ using Test
508
513
function test_bit_shiftavx (counter)
509
514
accu = zero (first (counter))
510
515
@turbo for i ∈ eachindex (counter)
511
- accu += counter[i] << (- 3 * 4 + 13 )
516
+ accu += counter[i] << (- 3 * 4 + 13 )
512
517
end
513
518
accu
514
519
end
892
897
setcolumstovectorplus100! (A, x)
893
898
setcolumstovectorplus100avx! (A2, x)
894
899
@test A == A2
895
-
900
+
896
901
maxdeg = 20 ; nbasis = 1_000 ; dim = 15 ;
897
902
r = T == Float32 ? (Int32 (1 ): Int32 (maxdeg+ 1 )) : (1 : maxdeg+ 1 )
898
903
basis = rand (r, (dim, nbasis));
@@ -1033,7 +1038,7 @@ end
1033
1038
@test c_re_1 ≈ c_re_2
1034
1039
1035
1040
@test loopinductvardivision (X1) ≈ loopinductvardivisionavx (X2)
1036
-
1041
+
1037
1042
mh = (
1038
1043
Wt_D_W = Matrix {T} (undef, 181 , 181 ),
1039
1044
Wt = rand (T, 181 , 191 ),
@@ -1050,7 +1055,7 @@ end
1050
1055
@test maxavx! (R, Q, true ) == max .(vec (maximum (Q, dims= (2 ,3 ))), Rc)
1051
1056
1052
1057
@test manyreturntest (Q) ≈ manyreturntestavx (Q)
1053
-
1058
+
1054
1059
U0 = randn (T, 15 , 17 ); E0 = randn (T, 17 );
1055
1060
U1, E1 = splitintonoloop_reference (copy (U0), copy (E0));
1056
1061
U2, E2 = splitintonoloop (copy (U0), copy (E0));
@@ -1064,7 +1069,7 @@ end
1064
1069
@test all (isequal (81 ), powcseliteral! (E0))
1065
1070
@test all (isequal (81 ), powcsesymbol! (E3))
1066
1071
1067
-
1072
+
1068
1073
@test isapprox (
1069
1074
maybe_const_issue144! (zeros (T, 3 ,4 ), (value= one (T),), collect (reshape (1 : 12 , 3 ,4 )), ones (T, 4 )),
1070
1075
maybe_const_issue144_avx! (zeros (T,3 ,4 ), (value= one (T),), collect (reshape (1 : 12 , 3 ,4 )), ones (T,4 )),
@@ -1144,13 +1149,48 @@ end
1144
1149
s
1145
1150
end
1146
1151
1147
- for T ∈ (Float32, Float64)
1152
+ function smoothdim_kernel_tile! (out, z, src:: AbstractArray , kernel:: AbstractVector , Rpre:: CartesianIndices , axout_tile, Rpost:: CartesianIndices )
1153
+ # z is an "overflow-safe zero". This is needed in cases where both `src` and `kernel` have eltypes vulnerable to arithmetic overflow.
1154
+ axkernel = axes (kernel, 1 )
1155
+ for Ipost in Rpost
1156
+ for i in axout_tile
1157
+ for Ipre in Rpre
1158
+ tmp = convert (eltype (out), z)
1159
+ for j in axkernel
1160
+ tmp += oftype (z, src[Ipre,i+ j,Ipost])* kernel[j]
1161
+ end
1162
+ out[Ipre,i,Ipost] = tmp
1163
+ end
1164
+ end
1165
+ end
1166
+ out
1167
+ end
1168
+ function smoothdim_kernel_tile_avx! (out, z, src:: AbstractArray , kernel:: AbstractVector , Rpre:: CartesianIndices , axout_tile, Rpost:: CartesianIndices )
1169
+ axkernel = axes (kernel, 1 )
1170
+ zout = convert (eltype (out), z)
1171
+ for Ipost in Rpost
1172
+ for i in axout_tile
1173
+ @turbo for Ipre in Rpre
1174
+ tmp = zout
1175
+ # tmp = convert(eltype(out), z) # failing to hoist this leads to an "UndefVarError: tmp not defined"
1176
+ for j in axkernel
1177
+ tmp += oftype (z, src[Ipre,i+ j,Ipost])* kernel[j]
1178
+ end
1179
+ out[Ipre,i,Ipost] = tmp
1180
+ end
1181
+ end
1182
+ end
1183
+ out
1184
+ end
1185
+
1186
+
1187
+ for T ∈ (UInt8,Float32, Float64)
1148
1188
@testset " Mixed CartesianIndex/Int indexing" begin
1149
1189
@show T, @__LINE__
1150
1190
# A demo similar to the exponential filtering demo from https://julialang.org/blog/2016/02/iteration/,
1151
1191
# but with no loop-carried dependency.
1152
1192
1153
- # s = dest1;
1193
+ # s = dest1;
1154
1194
# ifirst, ilast = first(axes(x, d)), last(axes(x, d))
1155
1195
# ls = LoopVectorization.@turbo_debug for Ipost in Rpost, i = ifirst:ilast, Ipre in Rpre
1156
1196
# xi = x[Ipre, i, Ipost]
@@ -1160,8 +1200,8 @@ end
1160
1200
# LoopVectorization.choose_order(ls);
1161
1201
1162
1202
M = 11 ;
1163
- x = rand (M,M,M,M,M);
1164
- dest1, dest2 = similar (x) , similar (x);
1203
+ x = rand (T, M,M,M,M,M);
1204
+ dest1, dest2 = similar (x, float (T)) , similar (x, float (T) );
1165
1205
α = 0.3
1166
1206
for d = 1 : ndims (x)
1167
1207
# @show d
@@ -1173,11 +1213,28 @@ end
1173
1213
fill! (dest2, NaN ); smoothdim_ifelse_avx! (dest2, x, α, Rpre, axes (x, d), Rpost);
1174
1214
@test dest1 ≈ dest2
1175
1215
end
1216
+
1217
+ # Mimic of ImageFiltering/ArrayFiltering's separable filters
1218
+ σ = 3
1219
+ ax = OffsetArray (- 6 : 6 , - 6 : 6 )
1220
+ kernel = float (T)[exp (- i^ 2 / (2 * σ^ 2 )) for i in ax] # 1-d Gaussian kernel
1221
+ srcax = 1 + firstindex (ax): M+ lastindex (ax)
1222
+ Mpad = length (srcax)
1223
+ src = OffsetArray (rand (T,Mpad,Mpad,Mpad,Mpad,Mpad),srcax,srcax,srcax,srcax,srcax)
1224
+ dest1 = Array {float(T),5} (undef,M,M,M,M,M)
1225
+ dest2 = similar (dest1)
1226
+ for d = 1 : ndims (src)
1227
+ Rpre = CartesianIndices (axes (dest1)[1 : d- 1 ]);
1228
+ Rpost = CartesianIndices (axes (dest1)[d+ 1 : end ]);
1229
+ smoothdim_kernel_tile! ( dest1, float (zero (T)), src, kernel, Rpre, axes (dest1, d), Rpost);
1230
+ smoothdim_kernel_tile_avx! (dest2, float (zero (T)), src, kernel, Rpre, axes (dest2, d), Rpost);
1231
+ @test dest1 ≈ dest2
1232
+ end
1176
1233
end
1177
1234
end
1178
1235
1179
1236
1180
- function mul1! (y:: Vector{T} , A:: Matrix{UInt8} , x:: Vector{T} ) where T
1237
+ function mul1! (y:: Vector{T} , A:: Matrix{UInt8} , x:: Vector{T} ) where T
1181
1238
packedstride = size (A, 1 )
1182
1239
m, n = size (A)
1183
1240
@turbo for j ∈ eachindex (x)
@@ -1190,7 +1247,7 @@ end
1190
1247
end
1191
1248
y
1192
1249
end
1193
- function mul2! (y:: Vector{T} , A:: Matrix{UInt8} , x:: Vector{T} ) where T
1250
+ function mul2! (y:: Vector{T} , A:: Matrix{UInt8} , x:: Vector{T} ) where T
1194
1251
packedstride = size (A, 1 )
1195
1252
m, n = size (A)
1196
1253
for j ∈ eachindex (x)
@@ -1318,7 +1375,7 @@ end
1318
1375
A0 = Vector {Float64} (undef, i- 1 ); A1 = similar (A0);
1319
1376
@test issue_257! (A0,G) ≈ issue_257_avx! (A1,G)
1320
1377
end
1321
-
1378
+
1322
1379
end
1323
1380
1324
1381
0 commit comments