Make shuffleloadstore functions nicer; only ignore the cost of +/- compute ops that depend solely on loop induction variables

chriselrod · chriselrod · commit ee37e09d9b24 · 2021-03-14T22:59:48.000-04:00
diff --git a/benchmark/openmp.c b/benchmark/openmp.c
@@ -0,0 +1,58 @@
+#include<omp.h>
+
+double dot(double* a, double* b, long N){ // returns sum over k in [0, N) of a[k]*b[k]
+  double acc = 0.0; // OpenMP reduction variable: each thread sums privately, then the partials are combined
+  #pragma omp parallel for reduction(+: acc)
+  for(long k = 0; k < N; ++k){
+    acc += a[k]*b[k];
+  }
+  return acc;
+}
+
+void cdot(double* c, double* a, double* b, long N){ // c = sum_k conj(a[k]) * b[k]; a, b, c hold interleaved (re, im) doubles
+  double re = 0.0, im = 0.0; // real and imaginary accumulators, reduced across threads
+  #pragma omp parallel for reduction(+: re, im)
+  for(long k = 0; k < N; ++k){
+    const double ar = a[2*k], ai = a[2*k+1], br = b[2*k], bi = b[2*k+1];
+    re += ar * br + ai * bi;
+    im += ar * bi - ai * br;
+  }
+  c[0] = re;
+  c[1] = im;
+}
+
+void cdot3(double* c, double* x, double* A, double* y, long M, long N){ // c = sum_n (sum_m conj(x[m]) * A[m,n]) * y[n], interleaved (re, im) doubles
+  double sr = 0.0, si = 0.0; // real/imaginary parts of the result, reduced across threads
+#pragma omp parallel for reduction(+: sr, si)
+  for (long n = 0; n < N; n++){
+    double tr = 0.0, ti = 0.0; // t = sum_m conj(x[m]) * A[m,n] for the current column n
+    for(long m = 0; m < M; m++){ // A assumed column-major M x N complex, so column n starts at double offset 2*n*M
+      tr += x[2*m] * A[2*m   + 2*n*M] + x[2*m+1] * A[2*m+1 + 2*n*M];
+      ti += x[2*m] * A[2*m+1 + 2*n*M] - x[2*m+1] * A[2*m   + 2*n*M];
+    }
+    sr += tr * y[2*n  ] - ti * y[2*n+1]; // s += t * y[n] (complex multiply, y not conjugated)
+    si += tr * y[2*n+1] + ti * y[2*n  ];
+  }
+  c[0] = sr;
+  c[1] = si;
+  return;
+}
+
+void conv(double* B, double* A, double* K, long M, long N){ // B[j-2,i-2] = sum_{k,l in -2..2} A[j+l,i+k] * K[l+2,k+2]; A has leading dim M
+  const long offset = 2, width = 2*offset + 1, ldb = M - 2*offset; // kernel is width x width; ldb is B's leading dimension
+  #pragma omp parallel for collapse(2)
+  for (long i = offset; i < N-offset; i++){
+    for (long j = offset; j < M-offset; j++){
+      double acc = 0.0;
+      for (long dk = -offset; dk <= offset; dk++){
+        for (long dl = -offset; dl <= offset; dl++){
+          acc += A[(j+dl) + (i+dk)*M] * K[(dl+offset) + (dk+offset)*width];
+        }
+      }
+      B[(j-offset) + (i-offset) * ldb] = acc;
+    }
+  }
+  return;
+}
+
+
diff --git a/src/codegen/lower_threads.jl b/src/codegen/lower_threads.jl
@@ -138,11 +138,11 @@ end
 end
 
 # if a threaded loop is vectorized, call
-function choose_num_blocks(M, ::StaticInt{U}, nt) where {U}
+@inline function choose_num_blocks(M, ::StaticInt{U}, nt) where {U}
     _choose_num_blocks(M % UInt, StaticInt{U}(), nt, lv_max_num_threads())
 end
 # otherwise, call
-choose_num_blocks(nt, ::StaticInt{NC} = lv_max_num_threads()) where {NC} = @inbounds choose_num_block_table(StaticInt{NC}())[nt]
+@inline choose_num_blocks(nt, ::StaticInt{NC} = lv_max_num_threads()) where {NC} = @inbounds choose_num_block_table(StaticInt{NC}())[nt]
 
 
 
@@ -160,7 +160,7 @@ choose_num_blocks(nt, ::StaticInt{NC} = lv_max_num_threads()) where {NC} = @inbo
 #     block_per_m, blocks_per_n
 # end
 
-function choose_num_threads(::Val{C}, ::Val{NT}, x) where {C,NT}
+@inline function choose_num_threads(::Val{C}, ::Val{NT}, x) where {C,NT}
     fx = Base.uitofp(Float64, x)
     min(Base.fptoui(UInt, Base.ceil_llvm(0.05460264079015985*C*Base.sqrt_llvm(fx))), NT)
 end
@@ -194,7 +194,7 @@ function push_loop_length_expr!(q::Expr, ls::LoopSet)
     end
     nothing
 end
-function divrem_fast(numerator, denominator)
+@inline function divrem_fast(numerator, denominator)
     d = Base.udiv_int(numerator, denominator)
     r = numerator - denominator*d
     d, r
diff --git a/src/modeling/determinestrategy.jl b/src/modeling/determinestrategy.jl
@@ -74,7 +74,9 @@ function cost(ls::LoopSet, op::Operation, (u₁,u₂)::Tuple{Symbol,Symbol}, vlo
         if instr == Instruction(:-) || instr === Instruction(:sub_fast) || instr == Instruction(:+) || instr == Instruction(:add_fast)
             return 0.0, 0, 0.0
         end
-    elseif iscompute(op) && all(opp -> (isloopvalue(opp) | isconstant(opp)), parents(op))
+    elseif iscompute(op) &&
+        Base.sym_in(instruction(op).instr, (:(+), :(-), :add_fast, :sub_fast)) &&
+        all(opp -> (isloopvalue(opp) | isconstant(opp)), parents(op))
         return 0.0, 0, 0.0
     end
     opisvectorized = isvectorized(op)
@@ -189,7 +191,8 @@ function evaluate_cost_unroll(
             included_vars[id] = true
             # @show op, cost(ls, op, vloopsym, Wshift, size_T)
             # TODO: use actual unrolls here?
-            total_cost += iter * first(cost(ls, op, (Symbol(""),Symbol("")), vloopsym, Wshift, size_T))
+            c = first(cost(ls, op, (Symbol(""),Symbol("")), vloopsym, Wshift, size_T))
+            total_cost += iter * c
             total_cost > max_cost && return total_cost # abort if more expensive; we only want to know the cheapest
         end
     end
diff --git a/test/shuffleloadstores.jl b/test/shuffleloadstores.jl
@@ -8,34 +8,37 @@ function dot_simd(a::AbstractVector, b::AbstractVector)
     end
     s
 end
-function cdot_mat(a::AbstractMatrix, b::AbstractMatrix)
-    re = zero(eltype(a))
-    im = zero(eltype(a))
+function cdot_mat(ca::AbstractVector{Complex{T}}, cb::AbstractVector{Complex{T}}) where {T}
+    a = reinterpret(reshape, T, ca)  # 2×N real view of ca: a[1,i] is the real part, a[2,i] the imaginary part
+    b = reinterpret(reshape, T, cb)  # same 2×N real view for cb
+    re = zero(T); im = zero(T)  # accumulate real/imaginary parts of sum(conj(ca[i]) * cb[i])
     @avx for i ∈ axes(a,2)
         re += a[1,i] * b[1,i] + a[2,i] * b[2,i]
         im += a[1,i] * b[2,i] - a[2,i] * b[1,i]
     end
-    Complex(re,im)
+    Complex(re, im)
 end
-function cdot_affine(a::AbstractVector, b::AbstractVector)
-    re = zero(eltype(a))
-    im = zero(eltype(a))
+function cdot_affine(ca::AbstractVector{Complex{T}}, cb::AbstractVector{Complex{T}}) where {T}
+    a = reinterpret(T, ca);  # flat real view of ca: a[2i-1] is the real part of ca[i], a[2i] the imaginary part
+    b = reinterpret(T, cb);  # same flat real view for cb
+    re = zero(T); im = zero(T)  # accumulate real/imaginary parts of sum(conj(ca[i]) * cb[i])
     # with a multiplier, we go from `i = 1 -> 2i = 2` to `i = 0 -> 2i = 0
     # 2(i+1-1) = 2i + 2 - 2, so....
     @avx for i ∈ 1:length(a)>>>1
         re += a[2i-1] * b[2i-1] + a[2i] * b[2i  ]
         im += a[2i-1] * b[2i  ] - a[2i] * b[2i-1]
     end
-    Complex(re,im)
+    Complex(re, im)
 end
-function cdot_stride(a::AbstractVector, b::AbstractVector)
-    re = zero(eltype(a))
-    im = zero(eltype(a))
+function cdot_stride(ca::AbstractVector{Complex{T}}, cb::AbstractVector{Complex{T}}) where {T}
+    a = reinterpret(T, ca);  # flat real view of ca; the stride-2 loop below visits each (re, im) pair at a[i], a[i+1]
+    b = reinterpret(T, cb);  # same flat real view for cb
+    re = zero(T); im = zero(T)  # accumulate real/imaginary parts of sum(conj(ca[i]) * cb[i])
     @avx for i ∈ 1:2:length(a)
         re += a[i] * b[i  ] + a[i+1] * b[i+1]
         im += a[i] * b[i+1] - a[i+1] * b[i  ]
     end
-    Complex(re,im)
+    Complex(re, im)
 end
 function qdot_simd(x::AbstractVector{NTuple{4,T}}, y::AbstractVector{NTuple{4,T}}) where {T}
     a = zero(T)
@@ -132,15 +135,11 @@ end
     for i ∈ 1:128
         ac = rand(Complex{Float64}, i);
         bc = rand(Complex{Float64}, i);
-        acv = reinterpret(Float64, ac);
-        bcv = reinterpret(Float64, bc);
         dsimd = dot_simd(ac, bc)
         if VERSION ≥ v"1.6.0-rc1"
-            acm = reinterpret(reshape, Float64, ac);
-            bcm = reinterpret(reshape, Float64, bc);
-            @test dsimd ≈ cdot_mat(acm, bcm)
+            @test dsimd ≈ cdot_mat(ac, bc)
         end
-        @test dsimd ≈ cdot_affine(acv, bcv) ≈ cdot_stride(acv, bcv)
+        @test dsimd ≈ cdot_affine(ac, bc) ≈ cdot_stride(ac, bc)
 
 
         xq = [ntuple(_ -> rand(), Val(4)) for _ ∈ 1:i];
diff --git a/test/threading.jl b/test/threading.jl
@@ -17,9 +17,9 @@ function AmulB!(C,A,B)
     C
 end
 function dot3(x::AbstractVector{Complex{T}}, A::AbstractMatrix{Complex{T}}, y::AbstractVector{Complex{T}}) where {T}
-    xr = reinterpret(reshape, Float64, x);
-    yr = reinterpret(reshape, Float64, y);
-    Ar = reinterpret(reshape, Float64, A);
+    xr = reinterpret(reshape, T, x);
+    yr = reinterpret(reshape, T, y);
+    Ar = reinterpret(reshape, T, A);
     sre = zero(T)
     sim = zero(T)
     @avxt for n in axes(Ar,3)