Fix for broadcasting Float32 and assigning zero(eltype(A)) within loop body.

chriselrod · chriselrod · commit a2ce880c687f · 2020-01-02T01:20:33.000-05:00
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "LoopVectorization"
 uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
 authors = ["Chris Elrod <elrodc@gmail.com>"]
-version = "0.1.3"
+version = "0.1.4"
 
 [deps]
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
diff --git a/README.md b/README.md
@@ -9,10 +9,7 @@
 ## Installation
 ```
 using Pkg
-Pkg.add(PackageSpec(url="https://github.com/chriselrod/VectorizationBase.jl"))
-Pkg.add(PackageSpec(url="https://github.com/chriselrod/SIMDPirates.jl"))
-Pkg.add(PackageSpec(url="https://github.com/chriselrod/SLEEFPirates.jl"))
-Pkg.add(PackageSpec(url="https://github.com/chriselrod/LoopVectorization.jl"))
+Pkg.add("LoopVectorization")
 ```
 
 
diff --git a/src/broadcast.jl b/src/broadcast.jl
@@ -18,14 +18,27 @@ function Base.Broadcast._broadcast_getindex_eltype(p::Product)
     )
 end
 
+# recursive_eltype(::Type{A}) where {T, A <: AbstractArray{T}} = T
+# recursive_eltype(::Type{NTuple{N,T}}) where {N,T<:Union{Float32,Float64}} = T
+# recursive_eltype(::Type{Float32}) = Float32
+# recursive_eltype(::Type{Float64}) = Float64
+# recursive_eltype(::Type{Tuple{T}}) where {T} = T
+# recursive_eltype(::Type{Tuple{T1,T2}}) where {T1,T2} = promote_type(recursive_eltype(T1), recursive_eltype(T2))
+# recursive_eltype(::Type{Tuple{T1,T2,T3}}) where {T1,T2,T3} = promote_type(recursive_eltype(T1), recursive_eltype(T2), recursive_eltype(T3))
+# recursive_eltype(::Type{Tuple{T1,T2,T3,T4}}) where {T1,T2,T3,T4} = promote_type(recursive_eltype(T1), recursive_eltype(T2), recursive_eltype(T3), recursive_eltype(T4))
+# recursive_eltype(::Type{Tuple{T1,T2,T3,T4,T5}}) where {T1,T2,T3,T4,T5} = promote_type(recursive_eltype(T1), recursive_eltype(T2), recursive_eltype(T3), recursive_eltype(T4), recursive_eltype(T5))
+
+# function recursive_eltype(::Type{Broadcasted{S,A,F,ARGS}}) where {S,A,F,ARGS}
+#     recursive_eltype(ARGS)
+# end
 
 @inline ∗(a::A, b::B) where {A,B} = Product{A,B}(a, b)
 @inline Base.Broadcast.broadcasted(::typeof(∗), a::A, b::B) where {A, B} = Product{A,B}(a, b)
 # TODO: Need to make this handle A or B being (1 or 2)-D broadcast objects.
 function add_broadcast!(
     ls::LoopSet, mC::Symbol, bcname::Symbol, loopsyms::Vector{Symbol},
     ::Type{Product{A,B}}, elementbytes::Int = 8
-) where {T,A,B}
+) where {A, B}
     K = gensym(:K)
     mA = gensym(:Aₘₖ)
     mB = gensym(:Bₖₙ)
@@ -54,7 +67,12 @@ function add_broadcast!(
     # load B
     loadB = add_broadcast!(ls, gensym(:B), mB, bloopsyms, B, elementbytes)
     # set Cₘₙ = 0
-    setC = add_constant!(ls, 0.0, cloopsyms, mC, elementbytes)
+    # setC = add_constant!(ls, zero(promote_type(recursive_eltype(A), recursive_eltype(B))), cloopsyms, mC, elementbytes)
+    setC = if elementbytes == 4
+        add_constant!(ls, 0f0, cloopsyms, mC, elementbytes)
+    else # elementbytes != 4: fall back to a Float64 zero
+        add_constant!(ls, 0.0, cloopsyms, mC, elementbytes)
+    end       
     # compute Cₘₙ += Aₘₖ * Bₖₙ
     reductop = Operation(
         ls, mC, elementbytes, :vmuladd, compute, reductdeps, Symbol[k], Operation[loadA, loadB, setC]
@@ -118,7 +136,7 @@ function add_broadcast!(
         argname = gensym(:arg)
         pushpreamble!(ls, Expr(:(=), argname, Expr(:ref, bcargs, i)))
         # dynamic dispatch
-        parent = add_broadcast!(ls, gensym(:temp), argname, loopsyms, arg)::Operation
+        parent = add_broadcast!(ls, gensym(:temp), argname, loopsyms, arg, elementbytes)::Operation
         pushparent!(parents, deps, reduceddeps, parent)
     end
     op = Operation(
@@ -130,7 +148,7 @@ end
 # size of dest determines loops
 @generated function vmaterialize!(
     dest::AbstractArray{T,N}, bc::BC
-) where {T, N, BC <: Broadcasted}
+) where {T <: Union{Float32,Float64}, N, BC <: Broadcasted}
 # ) where {N, T, BC <: Broadcasted}
     # we have an N dimensional loop.
     # need to construct the LoopSet
@@ -143,8 +161,9 @@ end
         push!(sizes.args, Nsym)
     end
     pushpreamble!(ls, Expr(:(=), sizes, Expr(:call, :size, :dest)))
-    add_broadcast!(ls, :dest, :bc, loopsyms, BC)
-    add_store!(ls, :dest, ArrayReference(:dest, loopsyms, Ref{Bool}(false)))
+    elementbytes = sizeof(T)
+    add_broadcast!(ls, :dest, :bc, loopsyms, BC, elementbytes)
+    add_store!(ls, :dest, ArrayReference(:dest, loopsyms, Ref{Bool}(false)), elementbytes)
     resize!(ls.loop_order, num_loops(ls)) # num_loops may be greater than N, eg Product
     q = lower(ls)
     push!(q.args, :dest)
diff --git a/src/graphs.jl b/src/graphs.jl
@@ -480,8 +480,13 @@ function add_operation!(
     if RHS.head === :ref
         add_load_ref!(ls, LHS, RHS, elementbytes)
     elseif RHS.head === :call
-        if first(RHS.args) === :getindex
+        f = first(RHS.args)
+        if f === :getindex
             add_load_getindex!(ls, LHS, RHS, elementbytes)
+        elseif f === :zero || f === :one
+            c = gensym(:constant)
+            pushpreamble!(ls, Expr(:(=), c, RHS))
+            add_constant!(ls, c, [keys(ls.loops)...], LHS, elementbytes)
         else
             add_compute!(ls, LHS, RHS, elementbytes)
         end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -36,7 +36,7 @@ using LoopVectorization
 
     @testset "GEMM" begin
         gemmq = :(for i ∈ 1:size(A,1), j ∈ 1:size(B,2)
-                  Cᵢⱼ = z#ero(eltype(C))
+                  Cᵢⱼ = zero(eltype(C))
                   for k ∈ 1:size(A,2)
                   Cᵢⱼ += A[i,k] * B[k,j]
                   end
@@ -57,9 +57,8 @@ using LoopVectorization
             end
         end
         function mygemmavx!(C, A, B)
-            z = zero(eltype(C))
             @avx for i ∈ 1:size(A,1), j ∈ 1:size(B,2)
-                Cᵢⱼ = z
+                Cᵢⱼ = zero(eltype(C))
                 for k ∈ 1:size(A,2)
                     Cᵢⱼ += A[i,k] * B[k,j]
                 end
@@ -202,9 +201,8 @@ using LoopVectorization
             end
         end
         function mygemvavx!(y, A, x)
-            z = zero(eltype(y))
             @avx for i ∈ eachindex(y)
-                yᵢ = z
+                yᵢ = zero(eltype(y))
                 for j ∈ eachindex(x)
                     yᵢ += A[i,j] * x[j]
                 end
@@ -262,9 +260,8 @@ using LoopVectorization
     end
 
     function mycolsumavx!(x, A)
-        z = zero(eltype(x))
         @avx for j ∈ eachindex(x)
-            xⱼ = z
+            xⱼ = zero(eltype(x))
             for i ∈ 1:size(A,2)
                 xⱼ += A[j,i]
             end
@@ -290,9 +287,8 @@ using LoopVectorization
         end
     end
     function myvaravx!(s², A, x̄)
-        z = zero(eltype(s²))
         @avx for j ∈ eachindex(s²)
-            s²ⱼ = z
+            s²ⱼ = zero(eltype(s²))
             x̄ⱼ = x̄[j]
             for i ∈ 1:size(A,2)
                 δ = A[j,i] - x̄ⱼ
@@ -328,7 +324,8 @@ end
     M, N = 37, 47
     # M = 77;
     # for T ∈ (Float32, Float64)
-    let T = Float64
+    for T ∈ (Float64, Float32)
+    # let T = Float64
         a = rand(T, M); B = rand(T, M, N); c = rand(T, N); c′ = c';
 
         d1 =      @. a + B * c′;