Try to improve performance for simple single loops when masks aren't needed, and revamp benchmarks.

chriselrod · chriselrod · commit 6185381dcf1b · 2020-05-29T08:25:35.000-04:00
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "LoopVectorization"
 uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
 authors = ["Chris Elrod <elrodc@gmail.com>"]
-version = "0.8.1"
+version = "0.8.2"
 
 [deps]
 DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
@@ -18,7 +18,7 @@ OffsetArrays = "1"
 SIMDPirates = "0.8.4"
 SLEEFPirates = "0.5"
 UnPack = "0,1"
-VectorizationBase = "0.12.2"
+VectorizationBase = "0.12.4"
 julia = "1.1"
 
 [extras]
diff --git a/benchmark/benchmarkflops.jl b/benchmark/benchmarkflops.jl
@@ -61,10 +61,10 @@ function matmul_bench!(br, C, A, B, i)
     @assert C ≈ Cblas "Fort builtin gemm wrong?"; fill!(C, NaN)
     br[10,i] = n_gflop / @belapsed ifgemm_builtin!($C, $A, $B)
     @assert C ≈ Cblas "ifort builtin gemm wrong?"; fill!(C, NaN)
-    br[11,i] = n_gflop / @belapsed mul!($C, $A, $B);
-    fill!(C, NaN)
+    br[11,i] = n_gflop / @belapsed dgemmopenblas!($C, $A, $B);
+    @assert C ≈ Cblas "OpenBLAS gemm wrong?"
     br[12,i] = n_gflop / @belapsed dgemmmkl!($C, $A, $B)
-    @assert C ≈ Cblas "MKL JIT gemm wrong?"
+    @assert C ≈ Cblas "MKL gemm wrong?"
     # br[12,i] = n_gflop / @belapsed gemmavx!($C, $A, $B)
 end
 function A_mul_B_bench!(br, s, i)
@@ -109,7 +109,7 @@ blastests() = [
     "GFortran", "icc", "ifort",
     "g++ & Eigen-3", "clang++ & Eigen-3",
     "GFortran-builtin", "ifort-builtin",
-    BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "MKL"
+    "OpenBLAS", "MKL"
 ]    
 
 function benchmark_AmulB(sizes)
@@ -160,7 +160,7 @@ function dot_bench!(br, s, i)
     br[9,i] = n_gflop / @belapsed dot($a, $b)
 end
 function benchmark_dot(sizes)
-    tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "clang++ & Eigen-3", BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS"]
+    tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "clang++ & Eigen-3", "OpenBLAS"]
     br = BenchmarkResult(tests, sizes)
     sm = br.sizedresults.results
     pmap(is -> dot_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -189,7 +189,7 @@ function selfdot_bench!(br, s, i)
     br[9,i] = n_gflop / @belapsed dot($a, $a)
 end
 function benchmark_selfdot(sizes)
-    tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "clang++ & Eigen-3", BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS"]
+    tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "clang++ & Eigen-3", "OpenBLAS"]
     br = BenchmarkResult(tests, sizes)
     sm = br.sizedresults.results
     pmap(is -> selfdot_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -222,7 +222,8 @@ function gemv_bench!(br, x, A, y, i)
     @assert x ≈ xblas "Fort wrong?"; fill!(x, NaN);
     br[10,i] = n_gflop / @belapsed ifgemv_builtin!($x, $A, $y)
     @assert x ≈ xblas "ifort wrong?"; fill!(x, NaN);
-    br[11,i] = n_gflop / @belapsed mul!($x, $A, $y)
+    br[11,i] = n_gflop / @belapsed dgemvopenblas!($x, $A, $y)
+    @assert x ≈ xblas "gemvopenblas wrong?"; fill!(x, NaN);
     br[12,i] = n_gflop / @belapsed dgemvmkl!($x, $A, $y)
     @assert x ≈ xblas "gemvmkl wrong?"; fill!(x, NaN);
 end
@@ -316,7 +317,7 @@ function sse_bench!(br, s, i)
     br[9,i] = n_gflop / @belapsed sse!($Xβ, $y, $X, $β)
 end
 function benchmark_sse(sizes)
-    tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "clang++ & Eigen-3", BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS"]
+    tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "clang++ & Eigen-3", "OpenBLAS"]
     br = BenchmarkResult(tests, sizes)
     sm = br.sizedresults.results
     pmap(is -> sse_bench!(sm, is[2], is[1]), enumerate(sizes))
diff --git a/benchmark/driver.jl b/benchmark/driver.jl
@@ -21,6 +21,7 @@ end
 
 # sizes = 23:23
 sizes = 256:-1:2
+longsizes = 512:-1:2
 
 logdettriangle_bench = benchmark_logdettriangle(sizes); println("logdet(LowerTriangular(A)) benchmark results:"); println(logdettriangle_bench)
 dot3_bench = benchmark_dot3(sizes); println("x' * A * y benchmark results:"); println(dot3_bench)
@@ -37,8 +38,8 @@ filter2d_dynamic_bench = benchmark_filter2ddynamic(sizes); println("Benchmark re
 filter2d_3x3_bench = benchmark_filter2d3x3(sizes); println("Benchmark results for statically sized 3x3 convolution:"); println(filter2d_3x3_bench)
 filter2d_unrolled_bench = benchmark_filter2dunrolled(sizes); println("Benchmark results for unrolled 3x3 convolution:"); println(filter2d_unrolled_bench)
 
-dot_bench = benchmark_dot(sizes); println("a' * b benchmark results:"); println(dot_bench)
-selfdot_bench = benchmark_selfdot(sizes); println("a' * a benchmark results:"); println(selfdot_bench)
+dot_bench = benchmark_dot(longsizes); println("a' * b benchmark results:"); println(dot_bench)
+selfdot_bench = benchmark_selfdot(longsizes); println("a' * a benchmark results:"); println(selfdot_bench)
 sse_bench = benchmark_sse(sizes); println("Benchmark resutls of summing squared error:"); println(sse_bench)
 aplusBc_bench = benchmark_aplusBc(sizes); println("Benchmark results of a .+ B .* c':"); println(aplusBc_bench)
 AplusAt_bench = benchmark_AplusAt(sizes); println("Benchmark results of A * A':"); println(AplusAt_bench)
diff --git a/benchmark/loadsharedlibs.jl b/benchmark/loadsharedlibs.jl
@@ -1,16 +1,9 @@
-using LinearAlgebra, LoopVectorization
+using LinearAlgebra, LoopVectorization, Libdl
 using LoopVectorization.VectorizationBase: REGISTER_SIZE
 
 # const LOOPVECBENCHDIR = joinpath(pkgdir(LoopVectorization), "benchmark")
 include(joinpath(LOOPVECBENCHDIR, "looptests.jl"))
 
-const LIBCTEST = joinpath(LOOPVECBENCHDIR, "libctests.so")
-const LIBFTEST = joinpath(LOOPVECBENCHDIR, "libftests.so")
-const LIBICTEST = joinpath(LOOPVECBENCHDIR, "libictests.so")
-const LIBIFTEST = joinpath(LOOPVECBENCHDIR, "libiftests.so")
-const LIBEIGENTEST = joinpath(LOOPVECBENCHDIR, "libetest.so")
-const LIBIEIGENTEST = joinpath(LOOPVECBENCHDIR, "libietest.so")
-const LIBDIRECTCALLJIT = joinpath(LOOPVECBENCHDIR, "libdcjtest.so")
 
 # requires Clang with polly to build
 cfile = joinpath(LOOPVECBENCHDIR, "looptests.c")
@@ -44,43 +37,108 @@ if !isfile(LIBIEIGENTEST) || mtime(eigenfile) > mtime(LIBIEIGENTEST)
     # run(`icpc -fast -qopt-zmm-usage=high -fargument-noalias-global -qoverride-limits -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST`)
 end
 
-MKL_ROOT = "/home/chriselrod/intel"
-directcalljitfile = joinpath(LOOPVECBENCHDIR, "directcalljit.f90")
-if !isfile(LIBDIRECTCALLJIT) || mtime(directcalljitfile) > mtime(LIBDIRECTCALLJIT)
-    run(`ifort -fast -DMKL_DIRECT_CALL_SEQ_JIT -fpp -qopt-zmm-usage=high -Wl,--start-group $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_intel_lp64.a")) $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_sequential.a")) $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_core.a")) -Wl,--end-group -I$(joinpath(MKL_ROOT, "mkl/include")) -I$(joinpath(MKL_ROOT, "compilers_and_libraries_2020.1.217/linux/mkl/include/intel64/lp64")) -shared -fPIC $directcalljitfile -o $LIBDIRECTCALLJIT`)
-    # run(`gfortran -Ofast -march=native -DMKL_DIRECT_CALL_SEQ_JIT -cpp -mprefer-vector-width=$(8REGISTER_SIZE) -Wl,--start-group $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_intel_lp64.a")) $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_sequential.a")) $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_core.a")) -Wl,--end-group -I$(joinpath(MKL_ROOT, "mkl/include")) -I$(joinpath(MKL_ROOT, "compilers_and_libraries_2020.1.217/linux/mkl/include/intel64/lp64")) -shared -fPIC $directcalljitfile -o $LIBDIRECTCALLJIT`)
+# MKL_ROOT = "/home/chriselrod/intel"
+# directcalljitfile = joinpath(LOOPVECBENCHDIR, "directcalljit.f90")
+# if !isfile(LIBDIRECTCALLJIT) || mtime(directcalljitfile) > mtime(LIBDIRECTCALLJIT)
+#     run(`ifort -fast -DMKL_DIRECT_CALL_SEQ_JIT -fpp -qopt-zmm-usage=high -Wl,--start-group $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_intel_lp64.a")) $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_sequential.a")) $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_core.a")) -Wl,--end-group -I$(joinpath(MKL_ROOT, "mkl/include")) -I$(joinpath(MKL_ROOT, "compilers_and_libraries_2020.1.217/linux/mkl/include/intel64/lp64")) -shared -fPIC $directcalljitfile -o $LIBDIRECTCALLJIT`)
+#     # run(`gfortran -Ofast -march=native -DMKL_DIRECT_CALL_SEQ_JIT -cpp -mprefer-vector-width=$(8REGISTER_SIZE) -Wl,--start-group $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_intel_lp64.a")) $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_sequential.a")) $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_core.a")) -Wl,--end-group -I$(joinpath(MKL_ROOT, "mkl/include")) -I$(joinpath(MKL_ROOT, "compilers_and_libraries_2020.1.217/linux/mkl/include/intel64/lp64")) -shared -fPIC $directcalljitfile -o $LIBDIRECTCALLJIT`)
     
-    # run(`gfortran -Ofast -march=native -DMKL_DIRECT_CALL_SEQ_JIT -cpp -mprefer-vector-width=$(8REGISTER_SIZE) -shared -fPIC $directcalljitfile -o $LIBDIRECTCALLJIT`)
-end
+#     # run(`gfortran -Ofast -march=native -DMKL_DIRECT_CALL_SEQ_JIT -cpp -mprefer-vector-width=$(8REGISTER_SIZE) -shared -fPIC $directcalljitfile -o $LIBDIRECTCALLJIT`)
+# end
 
-istransposed(x) = false
-istransposed(x::Adjoint) = true
-istransposed(x::Transpose) = true
-"""
-If transposed, requires them to be square
-"""
+
+const LIBCTEST = joinpath(LOOPVECBENCHDIR, "libctests.so")
+const LIBFTEST = joinpath(LOOPVECBENCHDIR, "libftests.so")
+const LIBICTEST = joinpath(LOOPVECBENCHDIR, "libictests.so")
+const LIBIFTEST = joinpath(LOOPVECBENCHDIR, "libiftests.so")
+const LIBEIGENTEST = joinpath(LOOPVECBENCHDIR, "libetest.so")
+const LIBIEIGENTEST = joinpath(LOOPVECBENCHDIR, "libietest.so")
+
+using MKL_jll, OpenBLAS_jll
+
+const libMKL = Libdl.dlopen(MKL_jll.libmkl_rt)
+const DGEMM_MKL = Libdl.dlsym(libMKL, :dgemm)
+const DGEMV_MKL = Libdl.dlsym(libMKL, :dgemv)
+const MKL_SET_NUM_THREADS = Libdl.dlsym(libMKL, :MKL_Set_Num_Threads)
+
+const libOpenBLAS = Libdl.dlopen(OpenBLAS_jll.libopenblas)
+const DGEMM_OpenBLAS = Libdl.dlsym(libOpenBLAS, :dgemm_64_)
+const DGEMV_OpenBLAS = Libdl.dlsym(libOpenBLAS, :dgemv_64_)
+const OPENBLAS_SET_NUM_THREADS = Libdl.dlsym(libOpenBLAS, :openblas_set_num_threads64_)
+
+istransposed(x) = 'N'
+istransposed(x::Adjoint{<:Real}) = 'T'
+istransposed(x::Adjoint) = 'C'
+istransposed(x::Transpose) = 'T'
 function dgemmmkl!(C::AbstractMatrix{Float64}, A::AbstractMatrix{Float64}, B::AbstractMatrix{Float64})
+    transA = istransposed(A)
+    transB = istransposed(B)
+    M, N = size(C); K = size(B, 1)
+    M32 = M % Int32
+    K32 = K % Int32
+    N32 = N % Int32
+    pA = parent(A); pB = parent(B)
+    ldA = stride(pA, 2) % Int32
+    ldB = stride(pB, 2) % Int32
+    ldC = stride(C, 2) % Int32
+    α = 1.0
+    β = 0.0
+    ccall(
+        DGEMM_MKL, Cvoid,
+        (Ref{UInt8}, Ref{UInt8}, Ref{Int32}, Ref{Int32}, Ref{Int32}, Ref{Float64}, Ref{Float64}, Ref{Int32}, Ref{Float64}, Ref{Int32}, Ref{Float64}, Ref{Float64}, Ref{Int32}),
+        transA, transB, M32, N32, K32, α, pA, ldA, pB, ldB, β, C, ldC
+    )
+end
+function dgemmopenblas!(C::AbstractMatrix{Float64}, A::AbstractMatrix{Float64}, B::AbstractMatrix{Float64})
+    transA = istransposed(A)
+    transB = istransposed(B)
     M, N = size(C); K = size(B, 1)
+    pA = parent(A); pB = parent(B)
+    ldA = stride(pA, 2)
+    ldB = stride(pB, 2)
+    ldC = stride(C, 2)
+    α = 1.0
+    β = 0.0
     ccall(
-        (:dgemmjit, LIBDIRECTCALLJIT), Cvoid,
-        (Ptr{Float64},Ptr{Float64},Ptr{Float64},Ref{Int},Ref{Int},Ref{Int},Ref{Bool},Ref{Bool}),
-        parent(C), parent(A), parent(B),
-        Ref(M), Ref(K), Ref(N),
-        Ref(istransposed(A)), Ref(istransposed(B))
+        DGEMM_OpenBLAS, Cvoid,
+        (Ref{UInt8}, Ref{UInt8}, Ref{Int64}, Ref{Int64}, Ref{Int64}, Ref{Float64}, Ref{Float64}, Ref{Int64}, Ref{Float64}, Ref{Int64}, Ref{Float64}, Ref{Float64}, Ref{Int64}),
+        transA, transB, M, N, K, α, pA, ldA, pB, ldB, β, C, ldC
     )
 end
-mkl_set_num_threads(N::Integer) = ccall((:set_num_threads, LIBDIRECTCALLJIT), Cvoid, (Ref{UInt32},), Ref(N % UInt32))
+mkl_set_num_threads(N::Integer) = ccall(MKL_SET_NUM_THREADS, Cvoid, (Ref{UInt32},), Ref(N % UInt32))
 mkl_set_num_threads(1)
-"""
-If transposed, requires them to be square
-"""
+openblas_set_num_threads(N::Integer) = ccall(OPENBLAS_SET_NUM_THREADS, Cvoid, (Ref{Int64},), Ref(N))
+openblas_set_num_threads(1)
 function dgemvmkl!(y::AbstractVector{Float64}, A::AbstractMatrix{Float64}, x::AbstractVector{Float64})
-    M, N = size(A);
+    transA = istransposed(A)
+    pA = parent(A)
+    M, N = size(pA)
+    M32 = M % Int32
+    N32 = N % Int32
+    ldA = stride(pA, 2) % Int32
+    incx = LinearAlgebra.stride1(x) % Int32
+    incy = LinearAlgebra.stride1(y) % Int32
+    α = 1.0
+    β = 0.0
+    ccall(
+        DGEMV_MKL, Cvoid,
+        (Ref{UInt8}, Ref{Int32}, Ref{Int32}, Ref{Float64}, Ref{Float64}, Ref{Int32}, Ref{Float64}, Ref{Int32}, Ref{Float64}, Ref{Float64}, Ref{Int32}),
+        transA, M32, N32, α, A, ldA, x, incx, β, y, incy
+    )
+end
+function dgemvopenblas!(y::AbstractVector{Float64}, A::AbstractMatrix{Float64}, x::AbstractVector{Float64})
+    transA = istransposed(A)
+    pA = parent(A)
+    M, N = size(pA)
+    ldA = stride(pA, 2)
+    incx = LinearAlgebra.stride1(x)
+    incy = LinearAlgebra.stride1(y)
+    α = 1.0
+    β = 0.0
     ccall(
-        (:dgemvjit, LIBDIRECTCALLJIT), Cvoid,
-        (Ptr{Float64},Ptr{Float64},Ptr{Float64},Ref{Int},Ref{Int},Ref{Bool}),
-        parent(y), parent(A), parent(x),
-        Ref(M), Ref(N), Ref(istransposed(A))
+        DGEMV_OpenBLAS, Cvoid,
+        (Ref{UInt8}, Ref{Int64}, Ref{Int64}, Ref{Float64}, Ref{Float64}, Ref{Int64}, Ref{Float64}, Ref{Int64}, Ref{Float64}, Ref{Float64}, Ref{Int64}),
+        transA, M, N, α, A, ldA, x, incx, β, y, incy
     )
 end
 
diff --git a/benchmark/looptests.c b/benchmark/looptests.c
@@ -248,8 +248,8 @@ void filter2d(double* restrict B, double* restrict A, double* restrict K, long M
   for (long na = offset; na < N-offset; na++){
     for (long ma = offset; ma < M-offset; ma++){
       double tmp = 0.0;
-      for (long nk = -offset; nk < offset + 1; nk++){
-	for (long mk = -offset; mk < offset + 1; mk++){
+      for (long mk = -offset; mk < offset + 1; mk++){
+	for (long nk = -offset; nk < offset + 1; nk++){
 	  tmp += A[(ma+mk) + (na+nk)*M] * K[(mk+offset) + (nk+offset)*(2*offset+1)];
 	}
       }
@@ -262,8 +262,8 @@ void filter2d3x3(double* restrict B, double* restrict A, double* restrict K, lon
   for (long na = offset; na < N-offset; na++){
     for (long ma = offset; ma < M-offset; ma++){
       double tmp = 0.0;
-      for (long nk = -offset; nk < offset + 1; nk++){
-	for (long mk = -offset; mk < offset + 1; mk++){
+      for (long mk = -offset; mk < offset + 1; mk++){
+	for (long nk = -offset; nk < offset + 1; nk++){
 	  tmp += A[(ma+mk) + (na+nk)*M] * K[(mk+offset) + (nk+offset)*(2*offset+1)];
 	}
       }
diff --git a/benchmark/looptests.f90 b/benchmark/looptests.f90
@@ -324,8 +324,10 @@ subroutine filter2d(B, A, K, Ma, Na, offset) BIND(C, name="filter2d")
       real(C_double) :: tmp
       do concurrent(mma = 1+offset:Ma-offset, nna = 1+offset:Na-offset)
          tmp = 0
-         do concurrent(nnk = -offset:offset, mmk = -offset:offset)
-            tmp = tmp + A(mma + mmk, nna + nnk) * K(mmk, nnk)
+         do mmk = -offset,offset
+            do nnk = -offset,offset
+               tmp = tmp + A(mma + mmk, nna + nnk) * K(mmk, nnk)
+            end do
          end do
          B(mma,nna) = tmp
       end do
@@ -340,8 +342,10 @@ subroutine filter2d3x3(B, A, K, Ma, Na) BIND(C, name="filter2d3x3")
       real(C_double) :: tmp
       do concurrent(mma = 1+offset:Ma-offset, nna = 1+offset:Na-offset)
          tmp = 0
-         do concurrent(nnk = -offset:offset, mmk = -offset:offset)
-            tmp = tmp + A(mma + mmk, nna + nnk) * K(mmk, nnk)
+         do mmk = -offset,offset
+            do nnk = -offset,offset
+               tmp = tmp + A(mma + mmk, nna + nnk) * K(mmk, nnk)
+            end do
          end do
          B(mma,nna) = tmp
       end do
diff --git a/src/LoopVectorization.jl b/src/LoopVectorization.jl
@@ -1,5 +1,9 @@
 module LoopVectorization
 
+if (!isnothing(get(ENV, "TRAVIS_BRANCH", nothing)) || !isnothing(get(ENV, "APPVEYOR", nothing))) && isdefined(Base, :Experimental) && isdefined(Base.Experimental, Symbol("@optlevel"))
+    @eval Base.Experimental.@optlevel 1
+end
+
 using VectorizationBase, SIMDPirates, SLEEFPirates, UnPack, OffsetArrays
 using VectorizationBase: REGISTER_SIZE, extract_data, num_vector_load_expr,
     mask, masktable, pick_vector_width_val, valmul, valrem, valmuladd, valmulsub, valadd, valsub, _MM,
diff --git a/src/lower_store.jl b/src/lower_store.jl
@@ -43,13 +43,19 @@ end
 # variable_name(op::Operation, suffix) = Symbol(mangledvar(op), suffix, :_)
 # # variable_name(op::Operation, suffix, u::Int) = (n = variable_name(op, suffix); u < 0 ? n : Symbol(n, u))
 function reduce_range!(q::Expr, toreduct::Symbol, instr::Instruction, Uh::Int, Uh2::Int)
-    for u ∈ Uh:Uh2-1
-        tru = Symbol(toreduct, u - Uh)
-        push!(q.args, Expr(:(=), tru, Expr(instr, tru, Symbol(toreduct, u))))
-    end
-    for u ∈ 2Uh:Uh2-1
-        tru = Symbol(toreduct, u - 2Uh)
-        push!(q.args, Expr(:(=), tru, Expr(instr, tru, Symbol(toreduct, u))))
+    if 2Uh == Uh2
+        for u ∈ 0:2:Uh2-1
+            push!(q.args, Expr(:(=), Symbol(toreduct, (u>>>1)), Expr(instr, Symbol(toreduct, u), Symbol(toreduct, u + 1))))
+        end
+    else
+        for u ∈ Uh:Uh2-1
+            tru = Symbol(toreduct, u - Uh)
+            push!(q.args, Expr(:(=), tru, Expr(instr, tru, Symbol(toreduct, u))))
+        end
+        for u ∈ 2Uh:Uh2-1
+            tru = Symbol(toreduct, u - 2Uh)
+            push!(q.args, Expr(:(=), tru, Expr(instr, tru, Symbol(toreduct, u))))
+        end
     end
 end
 function reduce_range!(q::Expr, ls::LoopSet, Ulow::Int, Uhigh::Int)
diff --git a/src/lowering.jl b/src/lowering.jl

Original file line number	Diff line number	Diff line change
`@@ -248,8 +248,8 @@ void filter2d(double* restrict B, double* restrict A, double* restrict K, long M`
`248`	`248`	`for (long na = offset; na < N-offset; na++){`
`249`	`249`	`for (long ma = offset; ma < M-offset; ma++){`
`250`	`250`	`double tmp = 0.0;`
`251`		`- for (long nk = -offset; nk < offset + 1; nk++){`
`252`		`- for (long mk = -offset; mk < offset + 1; mk++){`
	`251`	`+ for (long mk = -offset; mk < offset + 1; mk++){`
	`252`	`+ for (long nk = -offset; nk < offset + 1; nk++){`
`253`	`253`	`tmp += A[(ma+mk) + (na+nk)*M] * K[(mk+offset) + (nk+offset)*(2*offset+1)];`
`254`	`254`	`}`
`255`	`255`	`}`
`@@ -262,8 +262,8 @@ void filter2d3x3(double* restrict B, double* restrict A, double* restrict K, lon`
`262`	`262`	`for (long na = offset; na < N-offset; na++){`
`263`	`263`	`for (long ma = offset; ma < M-offset; ma++){`
`264`	`264`	`double tmp = 0.0;`
`265`		`- for (long nk = -offset; nk < offset + 1; nk++){`
`266`		`- for (long mk = -offset; mk < offset + 1; mk++){`
	`265`	`+ for (long mk = -offset; mk < offset + 1; mk++){`
	`266`	`+ for (long nk = -offset; nk < offset + 1; nk++){`
`267`	`267`	`tmp += A[(ma+mk) + (na+nk)*M] * K[(mk+offset) + (nk+offset)*(2*offset+1)];`
`268`	`268`	`}`
`269`	`269`	`}`