A few updates.

chriselrod · chriselrod · commit 7bdcfec27db6 · 2019-11-21T11:41:14.000-05:00
diff --git a/README.md b/README.md
@@ -14,3 +14,36 @@ Pkg.add(PackageSpec(url="https://github.com/chriselrod/SIMDPirates.jl"))
 Pkg.add(PackageSpec(url="https://github.com/chriselrod/SLEEFPirates.jl"))
 Pkg.add(PackageSpec(url="https://github.com/chriselrod/LoopVectorization.jl"))
 ```
+
+
+## Usage
+
+The current version of LoopVectorization provides a simple, dumb transform on a single loop.
+What I mean by this is that it will not check the transformations for validity. To be safe, I would stick to straight loops that transform arrays or calculate reductions.
+
+For example,
+```julia
+function sum_simd(x)
+    s = zero(eltype(x))
+    @simd for xᵢ ∈ x
+        s += xᵢ
+    end
+    s
+end
+using LoopVectorization, BenchmarkTools
+function sum_loopvec(x::AbstractVector{Float64})
+    s = 0.0
+    @vvectorize for i ∈ eachindex(x)
+        s += x[i]
+    end
+    s
+end
+x = rand(99);
+@btime sum($x)
+
+@btime sum_simd($x)
+@btime sum_loopvec($x)
+```
+
+
+
diff --git a/src/LoopVectorization.jl b/src/LoopVectorization.jl
@@ -2,7 +2,7 @@ module LoopVectorization
 
 using VectorizationBase, SIMDPirates, SLEEFPirates, MacroTools
 using VectorizationBase: REGISTER_SIZE, extract_data, num_vector_load_expr
-using SIMDPirates: VECTOR_SYMBOLS
+using SIMDPirates: VECTOR_SYMBOLS, evadd, evmul
 using MacroTools: @capture, prewalk, postwalk
 
 export vectorizable, @vectorize, @vvectorize
diff --git a/src/costs.jl b/src/costs.jl
@@ -9,49 +9,79 @@
 struct InstructionCost
     scalar_latency::Int
     scalar_reciprical_throughput::Float64
-    scaling::Float64 # sentinel values: -2 == no scaling; -1 == scaling, >0 ->  == latency == reciprical throughput
-    
+    scaling::Float64 # sentinel values: -3 == no scaling; -2 == offset_scaling, -1 == linear scaling, >0 ->  == latency == reciprical throughput
+    register_pressure::Int
 end
-InstructionCost(sl, srt) = InstructionCost(sl, srt, NoCost)
+InstructionCost(sl, srt, scaling = -3.0) = InstructionCost(sl, srt, scaling, 0)
 
 function scalar_cost(instruction::InstructionCost)#, ::Type{T} = Float64) where {T}
     instruction.scalar_latency, instruction.scalar_reciprical_throughput
 end
 function vector_cost(instruction::InstructionCost, Wshift, ::Type{T} = Float64) where {T}
     sl, srt = scalar_cost(instruction)
     scaling = instruction.scaling
-    if scaling == NoCost || Wshift == 0
-        returnsl, srt
-    elseif scaling == Linear
+    if scaling == -3.0 || Wshift == 0
+        return sl, srt
+    elseif scaling == -2.0
         srt *= 1 << (Wshift + VectorizationBase.intlog2(sizeof(T)) - 4)
         if (sizeof(T) << Wshift) == VectorizationBase.REGISTER_SIZE # These instructions experience double latency with zmm
             sl += sl
         end
-    end
-    
+    elseif scaling == -1.0
+        W = 1 << Wshift
+        extra_latency = sl - srt
+        srt *= W
+        sl = srt + extra_latency
+    else
+        sl, srt = scaling, scaling
+    end    
     sl, srt
 end
 function cost(instruction::InstructionCost, Wshift, ::Type{T}) where {T}
     Wshift == 0 ? scalar_cost(instruction) : vector_cost(instruction, Wshift, T)
 end
 
+# Just a semi-reasonable assumption; should not be that sensitive to anything other than loads
+const OPAQUE_COST = InstructionCost(50, 50.0, -1.0, 32)
+
 const COST = Dict{Symbol,InstructionCost}(
     :getindex => InstructionCost(3,0.5),
     :setindex! => InstructionCost(3,1.0), # but not a part of dependency chains, so not really twice as expensive?
     :+ => InstructionCost(4,0.5),
     :- => InstructionCost(4,0.5),
     :* => InstructionCost(4,0.5),
-    :/ => InstructionCost(13,4.0,),
+    :/ => InstructionCost(13,4.0,-2.0),
+    :inv => InstructionCost(13,4.0,-2.0,1),
-    :muladd => InstructionCost(0.5,4), # + and * will fuse into this, so much of the time they're not twice as expensive
+    :muladd => InstructionCost(4,0.5), # (latency, reciprocal throughput), in that order, since scalar_latency is an Int; + and * will fuse into this, so much of the time they're not twice as expensive
-    :sqrt => InstructionCost(),
-    :log => InstructionCost(,,52.5),
-    :exp => InstructionCost(,,30.0),
-    :sin => InstructionCost(),
-    :cos => InstructionCost(),
-    :sincos => InstructionCost(),
-    :
+    :sqrt => InstructionCost(15,4.0,-2.0),
+    :log => InstructionCost(20,20.0,40.0,20),
+    :exp => InstructionCost(20,20.0,20.0,18),
+    :sin => InstructionCost(18,15.0,68.0,23),
+    :cos => InstructionCost(18,15.0,68.0,26),
+    :sincos => InstructionCost(25,22.0,70.0,26)
 )
 
+function sum_simd(x)
+    s = zero(eltype(x))
+    @simd for xᵢ ∈ x
+        s += xᵢ
+    end
+    s
+end
+using LoopVectorization, BenchmarkTools
+function sum_loopvec(x::AbstractVector{Float64})
+    s = 0.0
+    @vvectorize 4 for i ∈ eachindex(x)
+        s += x[i]
+    end
+    s
+end
+x = rand(99);
+@btime sum($x)
+@btime sum_simd($x)
+@btime sum_loopvec($x)
+
+
 # const SIMDPIRATES_COST = Dict{Symbol,InstructionCost}()
 # const SLEEFPIRATES_COST = Dict{Symbol,InstructionCost}()
 
diff --git a/src/graphs.jl b/src/graphs.jl
@@ -31,7 +31,7 @@ end
 function evaluate_cost(
     ls::LoopSet, order::ShortIntVector
 )
-    
+    included_vars = Set{Symbol}() # construct an empty set; the bare `Set{Symbol}` would only bind the type
 end
 
 # Here, we have to figure out how to convert the loopset into a vectorized expression.
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -224,6 +224,100 @@ end
         (ts_end - ts_start) / (2N*K), vsum(s_1)
     end
 end
+
+
+
+@generated function estimate_cost_onearg_llvmunroll(f::F, N::Int = 512, K = 1_000, ::Type{T} = Float64, ::Val{U} = Val(4)) where {F,T,U}
+    W, Wshift = VectorizationBase.pick_vector_width_shift(T)
+    Ushift = VectorizationBase.intlog2(U)
+    W <<= Ushift
+    quote    
+        s = vbroadcast(Vec{$W,$T}, zero(T))
+        x = rand(T, N << $Wshift)
+        ptrx = pointer(x)
+        ts_start, id_start = cpucycle_id()
+        for k ∈ 1:K
+            _ptrx = ptrx
+            for n ∈ 1:N>>$Ushift
+                v = vload(Vec{$W,$T}, _ptrx)
+                s = vadd(s, f(v))
+                _ptrx += VectorizationBase.REGISTER_SIZE*$U
+            end
+        end
+        ts_end, id_end = cpucycle_id()
+        @assert id_start == id_end
+        (ts_end - ts_start) / (N*K), vsum(s)
+    end
+end
+@generated function estimate_cost_onearg_tworet_llvmunroll(f::F, N::Int = 512, K = 1_000, ::Type{T} = Float64, ::Val{U} = Val(4)) where {F,T,U}
+    W, Wshift = VectorizationBase.pick_vector_width_shift(T)
+    Ushift = VectorizationBase.intlog2(U)
+    W <<= Ushift
+    quote    
+        s = vbroadcast(Vec{$W,$T}, zero(T))
+        x = rand(T, N << $Wshift)
+        ptrx = pointer(x)
+        ts_start, id_start = cpucycle_id()
+        for k ∈ 1:K
+            _ptrx = ptrx
+            for n ∈ 1:N>>$Ushift
+                v = vload(Vec{$W,$T}, _ptrx)
+                a, b = f(v)
+                s = vmuladd(a, b, s)
+                _ptrx += VectorizationBase.REGISTER_SIZE*$U
+            end
+        end
+        ts_end, id_end = cpucycle_id()
+        @assert id_start == id_end
+        (ts_end - ts_start) / (N*K), vsum(s)
+    end
+end
+@generated function estimate_cost_twoarg_llvmunroll(f::F, N::Int = 512, K = 1_000, ::Type{T} = Float64, ::Val{U} = Val(4)) where {F,T,U}
+    W, Wshift = VectorizationBase.pick_vector_width_shift(T)
+    Ushift = VectorizationBase.intlog2(U)
+    W <<= Ushift
+    quote    
+        s = vbroadcast(Vec{$W,$T}, one(T))
+        x = rand(T, N << $Wshift)
+        ptrx = pointer(x)
+        ts_start, id_start = cpucycle_id()
+        for k ∈ 1:K
+            _ptrx = ptrx
+            for n ∈ 1:N>>$Ushift
+                v = vload(Vec{$W,$T}, _ptrx)
+                s = f(v, s)
+                _ptrx += VectorizationBase.REGISTER_SIZE*$U
+            end
+        end
+        ts_end, id_end = cpucycle_id()
+        @assert id_start == id_end
+        (ts_end - ts_start) / (N*K), vsum(s)
+    end
+end
+@generated function estimate_cost_threearg_llvmunroll(f::F, N::Int = 512, K = 1_000, ::Type{T} = Float64, ::Val{U} = Val(4)) where {F,T,U}
+    W, Wshift = VectorizationBase.pick_vector_width_shift(T)
+    Ushift = VectorizationBase.intlog2(U)
+    W <<= Ushift
+    quote    
+        s = vbroadcast(Vec{$W,$T}, zero(T))
+        x = rand(T, N << $Wshift)
+        ptrx = pointer(x)
+        ts_start, id_start = cpucycle_id()
+        for k ∈ 1:K
+            _ptrx = ptrx
+            for n ∈ 1:N>>$Ushift
+                v = vload(Vec{$W,$T}, _ptrx)
+                s = f(v, v, s)
+                _ptrx += VectorizationBase.REGISTER_SIZE*$U
+            end
+        end
+        ts_end, id_end = cpucycle_id()
+        @assert id_start == id_end
+        (ts_end - ts_start) / (N*K), vsum(s)
+    end
+end
+
+
 estimate_cost_onearg_serial(exp, 512, 1_000, Float64, Val(1)) # 21
 estimate_cost_onearg_serial(exp, 512, 1_000, Float64, Val(2)) # 18.4
 estimate_cost_onearg_serial(exp, 512, 1_000, Float64, Val(4)) # 17.5
@@ -254,10 +348,15 @@ estimate_cost_onearg_tworet_serial(sincos, 512, 1_000, Float64, Val(2)) # 23
 estimate_cost_onearg_tworet_serial(sincos, 512, 1_000, Float64, Val(4)) # 22
 
 
-estimate_cost_onearg(SLEEFPirates.exp, 512, 1_000, Float64, Val(1)) # 28 # 21
-estimate_cost_onearg(SLEEFPirates.exp, 512, 1_000, Float64, Val(2)) # 28 # 20
+estimate_cost_onearg(SLEEFPirates.exp, 512, 1_000, Float64, Val(1)) # 48 # 21
+estimate_cost_onearg(SLEEFPirates.exp, 512, 1_000, Float64, Val(2)) # 52 # 20
 estimate_cost_onearg(SLEEFPirates.exp, 512, 1_000, Float64, Val(4)) # 28 # 19.5
 
+estimate_cost_onearg_llvmunroll(SLEEFPirates.exp, 512, 1_000, Float64, Val(1)) # 50 # 21
+estimate_cost_onearg_llvmunroll(SLEEFPirates.exp, 512, 1_000, Float64, Val(2)) # 40 # 20
+# estimate_cost_onearg_llvmunroll(SLEEFPirates.exp, 512, 1_000, Float64, Val(3)) # 40 # 
+estimate_cost_onearg_llvmunroll(SLEEFPirates.exp, 512, 1_000, Float64, Val(4)) # 32 # 19.5
+
 estimate_cost_onearg(SLEEFPirates.log, 512, 1_000, Float64, Val(1)) # 51 cycles # 44
 estimate_cost_onearg(SLEEFPirates.log, 512, 1_000, Float64, Val(2)) # 51 cycles # 40
 estimate_cost_onearg(SLEEFPirates.log, 512, 1_000, Float64, Val(4)) # 51 cycles # 39

Original file line number	Diff line number	Diff line change
`@@ -31,7 +31,7 @@ end`
`31`	`31`	`function evaluate_cost(`
`32`	`32`	`ls::LoopSet, order::ShortIntVector`
`33`	`33`	`)`
`34`		`-`
	`34`	`+ included_vars = Set{Symbol}`
`35`	`35`	`end`
`36`	`36`
`37`	`37`	`# Here, we have to figure out how to convert the loopset into a vectorized expression.`