Broadcasting cannot use the new lowering approach. 32-bit builds only have 8 floating point registers available. Improve lowering with LLVM < 10 (10 was good already).

chriselrod · chriselrod · commit b09005245452 · 2020-05-25T02:26:24.000-04:00
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "LoopVectorization"
 uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
 authors = ["Chris Elrod <elrodc@gmail.com>"]
-version = "0.8.0"
+version = "0.8.1"
 
 [deps]
 DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
@@ -15,10 +15,10 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
 [compat]
 DocStringExtensions = "0.8"
 OffsetArrays = "1"
-SIMDPirates = "0.8.3"
+SIMDPirates = "0.8.4"
 SLEEFPirates = "0.5"
 UnPack = "0,1"
-VectorizationBase = "0.12.1"
+VectorizationBase = "0.12.2"
 julia = "1.1"
 
 [extras]
diff --git a/src/LoopVectorization.jl b/src/LoopVectorization.jl
@@ -1,7 +1,7 @@
 module LoopVectorization
 
 using VectorizationBase, SIMDPirates, SLEEFPirates, UnPack, OffsetArrays
-using VectorizationBase: REGISTER_SIZE, REGISTER_COUNT, extract_data, num_vector_load_expr,
+using VectorizationBase: REGISTER_SIZE, extract_data, num_vector_load_expr,
     mask, masktable, pick_vector_width_val, valmul, valrem, valmuladd, valmulsub, valadd, valsub, _MM,
     maybestaticlength, maybestaticsize, staticm1, staticp1, staticmul, subsetview, vzero, stridedpointer_for_broadcast,
     Static, Zero, StaticUnitRange, StaticLowerUnitRange, StaticUpperUnitRange, unwrap, maybestaticrange,
@@ -29,6 +29,17 @@ export LowDimArray, stridedpointer,
 
 const VECTORWIDTHSYMBOL, ELTYPESYMBOL = Symbol("##Wvecwidth##"), Symbol("##Tloopeltype##")
 
+"""
+REGISTER_COUNT defined in VectorizationBase is supposed to correspond to the actual number of floating point registers on the system.
+It is hardcoded into a file at build time.
+However, someone may have multiple builds of Julia on the same system, some 32-bit and some 64-bit (e.g., they use 64-bit primarily,
+but keep a 32-bit build on hand to debug test failures on Appveyor's 32-bit build). Thus, we don't want REGISTER_COUNT to be hardcoded
+in such a fashion. 
+32-bit builds are limited to only 8 floating point registers, so we take care of that here.
+
+If you want good performance, DO NOT use a 32-bit build of Julia if you don't have to.
+"""
+const REGISTER_COUNT = Sys.ARCH === :i686 ? 8 : VectorizationBase.REGISTER_COUNT
 
 include("vectorizationbase_extensions.jl")
 include("predicates.jl")
diff --git a/src/broadcast.jl b/src/broadcast.jl
@@ -245,6 +245,7 @@ end
     # @show typeof(dest)
     loopsyms = [gensym(:n) for n ∈ 1:N]
     ls = LoopSet(Mod)
+    ls.isbroadcast[] = true
     sizes = Expr(:tuple)
     for (n,itersym) ∈ enumerate(loopsyms)
         Nsym = gensym(:N)
@@ -271,6 +272,7 @@ end
     # need to construct the LoopSet
     loopsyms = [gensym(:n) for n ∈ 1:N]
     ls = LoopSet(Mod)
+    ls.isbroadcast[] = true
     pushpreamble!(ls, Expr(:(=), :dest, Expr(:call, :parent, :dest′)))
     sizes = Expr(:tuple)
     for (n,itersym) ∈ enumerate(loopsyms)
diff --git a/src/costs.jl b/src/costs.jl
@@ -91,7 +91,7 @@ function vector_cost(ic::InstructionCost, Wshift, sizeof_T)
     srt, sl, srp
 end
 
-const OPAQUE_INSTRUCTION = InstructionCost(-1.0, 50, 50.0, VectorizationBase.REGISTER_COUNT)
+const OPAQUE_INSTRUCTION = InstructionCost(-1.0, 50, 50.0, REGISTER_COUNT)
 
 instruction_cost(instruction::Instruction) = instruction.mod === :LoopVectorization ? COST[instruction.instr] : OPAQUE_INSTRUCTION
 instruction_cost(instruction::Symbol) = get(COST, instruction, OPAQUE_INSTRUCTION)
diff --git a/src/determinestrategy.jl b/src/determinestrategy.jl
@@ -370,7 +370,7 @@ function solve_unroll(
     W::Int, vectorized::Symbol,
     u₁loop::Loop, u₂loop::Loop
 )
-    maxu₂base = maxu₁base = VectorizationBase.REGISTER_COUNT == 32 ? 10 : 6#8
+    maxu₂base = maxu₁base = REGISTER_COUNT == 32 ? 10 : 6#8
     maxu₂ = maxu₂base#8
     maxu₁ = maxu₁base#8
     u₁L = length(u₁loop)
@@ -535,13 +535,13 @@ function load_elimination_cost_factor!(
         #     if isstaticloop(loop) && length(loop) ≤ 4
         #         itersym = loop.itersymbol
         #         if itersym !== u₁loopsym && itersym !== u₂loopsym
-        #             return (0.25, VectorizationBase.REGISTER_COUNT == 32 ? 2.0 : 1.0)
+        #             return (0.25, REGISTER_COUNT == 32 ? 2.0 : 1.0)
         #             # return (0.25, 1.0)
         #             return true
         #         end
         #     end
         # end
-        # # (0.25, VectorizationBase.REGISTER_COUNT == 32 ? 1.2 : 1.0)
+        # # (0.25, REGISTER_COUNT == 32 ? 1.2 : 1.0)
         # (0.25, 1.0)
         cost_vec[1] += 0.1rt
         reg_pressure[1] += 0.51rp
@@ -707,7 +707,7 @@ function evaluate_cost_tile(
         end
     end
     # @show cost_vec reg_pressure
-    costpenalty = (sum(reg_pressure) > VectorizationBase.REGISTER_COUNT) ? 2 : 1
+    costpenalty = (sum(reg_pressure) > REGISTER_COUNT) ? 2 : 1
     # @show order, vectorized cost_vec reg_pressure
     # @show solve_unroll(ls, u₁loopsym, u₂loopsym, cost_vec, reg_pressure)
     u₁, u₂, ucost = solve_unroll(ls, u₁loopsym, u₂loopsym, cost_vec, reg_pressure, W, vectorized)
diff --git a/src/graphs.jl b/src/graphs.jl
@@ -266,6 +266,7 @@ struct LoopSet
     unrollspecification::Base.RefValue{UnrollSpecification}
     loadelimination::Base.RefValue{Bool}
     lssm::Base.RefValue{LoopStartStopManager}
+    isbroadcast::Base.RefValue{Bool}
     mod::Symbol
 end
 
@@ -353,7 +354,8 @@ function LoopSet(mod::Symbol)
         Matrix{Float64}(undef, 4, 2),
         Matrix{Float64}(undef, 5, 2),
         Bool[], Bool[], Ref{UnrollSpecification}(),
-        Ref(false), Ref{LoopStartStopManager}(), mod
+        Ref(false), Ref{LoopStartStopManager}(),
+        Ref(false), mod
     )
 end
 
diff --git a/src/loopstartstopmanager.jl b/src/loopstartstopmanager.jl
@@ -12,28 +12,31 @@ function uniquearrayrefs(ls::LoopSet)
     uniquerefs
 end
 
-otherindexunrolled(loopsym::Symbol, ind::Symbol, loopdeps::Vector{Symbol}) = loopsym !== ind && loopsym ∈ loopdeps
+otherindexunrolled(loopsym::Symbol, ind::Symbol, loopdeps::Vector{Symbol}) = (loopsym !== ind) && (loopsym ∈ loopdeps)
 function otherindexunrolled(ls::LoopSet, ind::Symbol, ref::ArrayReferenceMeta)
     us = ls.unrollspecification[]
-    otherindexunrolled(getloopsym(ls, us.u₁loopnum), ind, loopdependencies(ref)) || otherindexunrolled(getloopsym(ls, us.u₂loopnum), ind, loopdependencies(ref))
+    u₁sym = names(ls)[us.u₁loopnum]
+    u₂sym = us.u₂loopnum > 0 ? names(ls)[us.u₂loopnum] : Symbol("##undefined##")
+    otherindexunrolled(u₁sym, ind, loopdependencies(ref)) || otherindexunrolled(u₂sym, ind, loopdependencies(ref))
 end
 multiple_with_name(n::Symbol, v::Vector{ArrayReferenceMeta}) = sum(ref -> n === vptr(ref), v) > 1
 # TODO: DRY between indices_calculated_by_pointer_offsets and use_loop_induct_var
 function indices_calculated_by_pointer_offsets(ls::LoopSet, ar::ArrayReferenceMeta)
-    looporder = names(ls)
     indices = getindices(ar)
+    ls.isbroadcast[] && return fill(false, length(indices))
+    looporder = names(ls)
     offset = isdiscontiguous(ar)
     gespinds = Expr(:tuple)
     out = Vector{Bool}(undef, length(indices))
     li = ar.loopedindex
     for i ∈ eachindex(li)
         ii = i + offset
         ind = indices[ii]
-        j = findfirst(isequal(ind), view(indices, offset+1:ii-1))
-        if !isnothing(j)
-            out[i] = out[j - offset]
-            continue
-        end
+        # j = findfirst(isequal(ind), view(indices, offset+1:ii-1))
+        # if !isnothing(j)
+        #     out[i] = out[j - offset]
+        #     continue
+        # end
         if (!li[i]) || multiple_with_name(vptr(ar), ls.lssm[].uniquearrayrefs)
             out[i] = false
         elseif (isone(ii) && (first(looporder) === ind))
@@ -61,19 +64,21 @@ function use_loop_induct_var!(ls::LoopSet, q::Expr, ar::ArrayReferenceMeta, alla
         println(ar)
         throw("Length of indices and length of offset do not match!")
     end
+    isbroadcast = ls.isbroadcast[]
     gespinds = Expr(:tuple)
     for i ∈ eachindex(li)
         ii = i + offset
         ind = indices[ii]
-        j = findfirst(isequal(ind), view(indices, offset+1:ii-1))
-        if !isnothing(j)
-            j -= offset
-            push!(gespinds.args, gespinds.args[j])
-            uliv[i] = uliv[j]
-        elseif (!li[i])
+        # j = findfirst(isequal(ind), view(indices, offset+1:ii-1))
+        # if !isnothing(j)
+        #     j -= offset
+        #     push!(gespinds.args, gespinds.args[j])
+        #     uliv[i] = uliv[j]
+        # else
+        if (!li[i])
             uliv[i] = 0
             push!(gespinds.args, Expr(:call, lv(:Zero)))
-        elseif (isone(ii) && (last(looporder) === ind)) && !(otherindexunrolled(ls, ind, ar)) || multiple_with_name(vptr(ar), allarrayrefs)
+        elseif isbroadcast || ((isone(ii) && (last(looporder) === ind)) && !(otherindexunrolled(ls, ind, ar)) || multiple_with_name(vptr(ar), allarrayrefs))
             uliv[i] = -findfirst(isequal(ind), looporder)::Int
             push!(gespinds.args, Expr(:call, lv(:Zero)))
         else
diff --git a/src/lower_memory_common.jl b/src/lower_memory_common.jl
@@ -39,25 +39,12 @@ function addoffset!(ret::Expr, ex, offset::Integer, _mm::Bool = false)
     nothing
 end
 function addoffset!(ret::Expr, offset::Int, _mm::Bool = false)
-    if iszero(offset)
-        ex = Expr(:call, lv(:Zero))
-        if _mm
-            push!(ret.args, _MMind(ex))
-        else
-            push!(ret.args, ex)
-        end
-    elseif isone(offset)
-        ex = Expr(:call, Expr(:curly, lv(:Static), offset))
-        if _mm
-            push!(ret.args, _MMind(ex))
-        else
-            push!(ret.args, ex)
-        end        
-    elseif _mm
-        push!(ret.args, _MMind(offset))
+    ex = Expr(:call, Expr(:curly, lv(:Static), offset))
+    if _mm
+        push!(ret.args, _MMind(ex))
     else
-        push!(ret.args, offset)
-    end
+        push!(ret.args, ex)
+    end        
     nothing
 end
 
@@ -146,6 +133,7 @@ function mem_offset_u(op::Operation, td::UnrollArgs, inds_calc_by_ptr_offset::Ve
     ret = Expr(:tuple)
     indices = getindicesonly(op)
     offsets = getoffsets(op)
+    # allbasezero = all(inds_calc_by_ptr_offset) && all(iszero, offsets)
     loopedindex = op.ref.loopedindex
     if iszero(incr₁) & iszero(incr₂)
         return mem_offset(op, td, inds_calc_by_ptr_offset)
diff --git a/test/broadcast.jl b/test/broadcast.jl
@@ -4,25 +4,25 @@
     for T ∈ (Float32, Float64, Int32, Int64)
         @show T, @__LINE__
         R = T <: Integer ? (T(-100):T(100)) : T
-        a = rand(R,100,100,100);
-        b = rand(R,100,100,1);
+        a = rand(R,99,99,99);
+        b = rand(R,99,99,1);
         bl = LowDimArray{(true,true,false)}(b);
-        br = reshape(b, (100,100));
+        br = reshape(b, (99,99));
         c1 = a .+ b;
         c2 = @avx a .+ bl;
         @test c1 ≈ c2
         fill!(c2, 99999); @avx c2 .= a .+ br;
         @test c1 ≈ c2
         fill!(c2, 99999); @avx c2 .= a .+ b;
         @test c1 ≈ c2
-        br = reshape(b, (100,1,100));
+        br = reshape(b, (99,1,99));
         bl = LowDimArray{(true,false,true)}(br);
         @. c1 = a + br;
         fill!(c2, 99999); @avx @. c2 = a + bl;
         @test c1 ≈ c2
         fill!(c2, 99999); @avx @. c2 = a + br;
         @test c1 ≈ c2
-        br = reshape(b, (1,100,100));
+        br = reshape(b, (1,99,99));
         bl = LowDimArray{(false,true,true)}(br);
         @. c1 = a + br;
         fill!(c2, 99999);
@@ -33,6 +33,16 @@
         max_ = maximum(xs, dims=1)
         @test (@avx exp.(xs .- LowDimArray{(false,)}(max_))) ≈ exp.(xs .- LowDimArray{(false,)}(max_))
 
+        if T === Int32
+            a = rand(T(1):T(100), 73, 1)
+            @test sqrt.(Float32.(a)) ≈ @avx sqrt.(a)
+        elseif T === Int64
+            a = rand(T(1):T(100), 73, 1)
+            @test sqrt.(a) ≈ @avx sqrt.(a)
+        else
+            a = rand(T, 73, 1)
+            @test sqrt.(a) ≈ @avx sqrt.(a)
+        end
         
         a = rand(R, M); B = rand(R, M, N); c = rand(R, N); c′ = c';
         d1 =      @. a + B * c′;
diff --git a/test/gemm.jl b/test/gemm.jl
diff --git a/test/gemv.jl b/test/gemv.jl
diff --git a/test/miscellaneous.jl b/test/miscellaneous.jl
diff --git a/test/runtests.jl b/test/runtests.jl