Resolve #13.

chriselrod · chriselrod · commit c32809e5c109 · 2020-01-08T19:52:58.000-05:00
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "LoopVectorization"
 uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
 authors = ["Chris Elrod <elrodc@gmail.com>"]
-version = "0.3.2"
+version = "0.3.3"
 
 [deps]
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
diff --git a/README.md b/README.md
@@ -157,7 +157,7 @@ d2 = @avx @. a + B * c′;
 can be optimized in a similar manner to BLAS, albeit to a much smaller degree because the naive version already benefits from vectorization (unlike the naive BLAS).
 
 
-You can also use `∗` (which is typed `\ast` and not to be confused with `*`) for lazy matrix multiplication that can fuse with broadcasts. `.\ast` behaves similarly, espcaping the broadcast (it is not applied elementwise). This allows you to use `@.` and fuse all the loops, even if the arguments to `\ast` are themselves broadcasted objects. However, it will often be the case that creating an intermediary is faster. I would recomend always checking if splitting the operation into pieces, or at least isolating the matrix multiplication, increases performance. That will often be the case, especially if the matrices are large, where a separate multiplication can leverage BLAS (and perhaps take advantage of threads).
+You can also use `*ˡ` (which is typed `*\^l`) for lazy matrix multiplication that can fuse with broadcasts. `.*ˡ` behaves similarly, escaping the broadcast (it is not applied elementwise). This allows you to use `@.` and fuse all the loops, even if the arguments to `*ˡ` are themselves broadcasted objects. However, it will often be the case that creating an intermediary is faster. I would recommend always checking if splitting the operation into pieces, or at least isolating the matrix multiplication, increases performance. That will often be the case, especially if the matrices are large, where a separate multiplication can leverage BLAS (and perhaps take advantage of threads).
 
 At small sizes, this can be fast.
 ```julia
diff --git a/src/LoopVectorization.jl b/src/LoopVectorization.jl
@@ -10,7 +10,7 @@ using MacroTools: prewalk, postwalk
 
 
 export LowDimArray, stridedpointer, vectorizable,
-    @avx, ∗,
+    @avx, *ˡ, ∗,
     vmap, vmap!
 
 
diff --git a/src/broadcast.jl b/src/broadcast.jl
@@ -36,8 +36,9 @@ end
 #     recursive_eltype(ARGS)
 # end
 
-@inline ∗(a::A, b::B) where {A,B} = Product{A,B}(a, b)
-@inline Base.Broadcast.broadcasted(::typeof(∗), a::A, b::B) where {A, B} = Product{A,B}(a, b)
+@inline *ˡ(a::A, b::B) where {A,B} = Product{A,B}(a, b)
+@inline Base.Broadcast.broadcasted(::typeof(*ˡ), a::A, b::B) where {A, B} = Product{A,B}(a, b)
+const ∗ = *ˡ
 # TODO: Need to make this handle A or B being (1 or 2)-D broadcast objects.
 function add_broadcast!(
     ls::LoopSet, mC::Symbol, bcname::Symbol, loopsyms::Vector{Symbol},
diff --git a/src/graphs.jl b/src/graphs.jl
@@ -162,15 +162,6 @@ getop(ls::LoopSet, s::Symbol) = ls.opdict[s]
 getop(ls::LoopSet, i::Int) = ls.operations[i + 1]
 
 @inline extract_val(::Val{N}) where {N} = N
-function determine_veced_increment(ls::LoopSet, iter::Symbol, isunrolled::Bool, W::Symbol, U::Int) # , istiled::Bool, ..., T::Int # may not be tiled
-    if isunrolled
-        Expr(:call, lv(:valmul), W, U)
-    # elseif istiled
-        # Expr(:call, lv(:valmul), W, T)
-    else
-        Expr(:call, lv(:extract_val), W)
-    end
-end
 function vec_looprange(ls::LoopSet, s::Symbol, isunrolled::Bool, W::Symbol, U::Int, loop = ls.loops[s])
     incr = if isunrolled
         Expr(:call, lv(:valmuladd), W, U, -1)
@@ -191,15 +182,15 @@ function looprange(ls::LoopSet, s::Symbol, incr::Int = 1, mangledname::Symbol =
         Expr(:call, :<, mangledname, loop.hintexact ? loop.rangehint - incr : Expr(:call, :-, loop.rangesym, incr))
     end
 end
-function looprange(ls::LoopSet, s::Symbol, incr::Expr, mangledname::Symbol = s, loop = ls.loops[s])
-    increxpr = Expr(:call, :-, incr, 1)
-    increxpr = if loop.hintexact
-        Expr(:call, :-, loop.rangehint, increxpr)
-    else
-        Expr(:call, :-, loop.rangesym, increxpr)
-    end
-    Expr(:call, :<, mangledname, increxpr)
-end
+# function looprange(ls::LoopSet, s::Symbol, incr::Expr, mangledname::Symbol = s, loop = ls.loops[s])
+#     increxpr = Expr(:call, :-, incr, 1)
+#     increxpr = if loop.hintexact
+#         Expr(:call, :-, loop.rangehint, increxpr)
+#     else
+#         Expr(:call, :-, loop.rangesym, increxpr)
+#     end
+#     Expr(:call, :<, mangledname, increxpr)
+# end
 
 function Base.length(ls::LoopSet, is::Symbol)
     ls.loops[is].rangehint
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -62,13 +62,13 @@ using LinearAlgebra
                 C[m,n] = Cₘₙ
             end
         end
-        function AmuladdBavx!(C, A, B)
+        function AmuladdBavx!(C, A, B, factor = 1)
             @avx for m ∈ 1:size(A,1), n ∈ 1:size(B,2)
                 ΔCₘₙ = zero(eltype(C))
                 for k ∈ 1:size(A,2)
                     ΔCₘₙ += A[m,k] * B[k,n]
                 end
-                C[m,n] += ΔCₘₙ
+                C[m,n] += ΔCₘₙ * factor
             end
         end
 
@@ -178,6 +178,8 @@ using LinearAlgebra
             @test C ≈ C2
             AmuladdBavx!(C, A, B)
             @test C ≈ 2C2
+            AmuladdBavx!(C, A, B, -1)
+            @test C ≈ C2
             At = copy(A');
             fill!(C, 9999.999); AtmulBavx!(C, At, B)
             @test C ≈ C2
@@ -475,25 +477,25 @@ end
 
         d3 = a .+ B * c;
         # no method matching _similar_for(::UnitRange{Int64}, ::Type{Any}, ::Product)
-        d4 = @avx a .+ B ∗ c;
+        d4 = @avx a .+ B *ˡ c;
         @test d3 ≈ d4
 
         fill!(d3, -1000.0);
         fill!(d4, 91000.0);
 
         d3 .= a .+ B * c;
-        @avx d4 .= a .+ B ∗ c;
+        @avx d4 .= a .+ B *ˡ c;
         @test d3 ≈ d4
 
         fill!(d4, 91000.0);
-        @avx @. d4 = a + B ∗ c;
+        @avx @. d4 = a + B *ˡ c;
         @test d3 ≈ d4
 
         M, K, N = 77, 83, 57;
         A = rand(T,M,K); B = rand(T,K,N); C = rand(T,M,N);
 
         D1 = C .+ A * B;
-        D2 = @avx C .+ A ∗ B;
+        D2 = @avx C .+ A *ˡ B;
         @test D1 ≈ D2
 
         D3 = exp.(B');