Fix offset adjustment for offset indices that don't have constant stride. Fixes #287.

chriselrod · chriselrod · commit bb2aef76ef3d · 2021-06-19T23:43:32.000-04:00
diff --git a/Project.toml b/Project.toml
@@ -30,5 +30,5 @@ Static = "0.2"
 StrideArraysCore = "0.1.12"
 ThreadingUtilities = "0.4.2"
 UnPack = "1"
-VectorizationBase = "0.20.17"
+VectorizationBase = "0.20.18"
 julia = "1.5"
diff --git a/src/codegen/lower_memory_common.jl b/src/codegen/lower_memory_common.jl
@@ -138,10 +138,10 @@ function mem_offset(op::Operation, td::UnrollArgs, inds_calc_by_ptr_offset::Vect
         indvectorized = _mm & (ind === vloopsym)
         offset = offsets[n] % Int
         stride = strides[n] % Int
-        ind_by_offset = inds_calc_by_ptr_offset[n] | (ind === CONSTANTZEROINDEX)
-        if !ind_by_offset
-            offset += (stride - 1)
+        if ind ≢ CONSTANTZEROINDEX
+          offset += (stride - 1)
         end
+        ind_by_offset = inds_calc_by_ptr_offset[n] | (ind === CONSTANTZEROINDEX)
         @unpack vstep = td
         if loopedindex[n]
             addoffset!(ret, indvectorized, vstep, stride, ind, offset, ind_by_offset) # 7 arg
@@ -297,8 +297,8 @@ function mem_offset_u(
             stride = convert(Int, strides[n])
             indvectorized = ind === vloopsym
             indvectorizedmm = _mm & indvectorized
-            if !ind_by_offset
-                offset += (stride - 1)
+            if ind ≢ CONSTANTZEROINDEX
+              offset += (stride - 1)
             end
             if ind === u₁loopsym
                 addvectoroffset!(ret, indvectorizedmm, incr₁, u₁step, vstep, stride, ind, offset, ind_by_offset, indvectorized) # 9 arg
diff --git a/src/modeling/graphs.jl b/src/modeling/graphs.jl
@@ -383,7 +383,7 @@ mutable struct LoopSet
     preamble::Expr
     prepreamble::Expr # performs extractions that must be performed first, and don't need further registering
     preamble_symsym::Vector{Tuple{Int,Symbol}}
-    preamble_symint::Vector{Tuple{Int,Tuple{Int,Int32,Bool}}}
+    preamble_symint::Vector{Tuple{Int,Tuple{Int,Int32,Bool}}} # (id,(intval,intsz,signed))
     preamble_symfloat::Vector{Tuple{Int,Float64}}
     preamble_zeros::Vector{Tuple{Int,NumberType}}
     preamble_funcofeltypes::Vector{Tuple{Int,Float64}}
diff --git a/src/parse/memory_ops_common.jl b/src/parse/memory_ops_common.jl
@@ -113,6 +113,7 @@ byterepresentable(x::Integer)::Bool = typemin(Int8) ≤ x ≤ typemax(Int8)
 function _addoffset!(indices, offsets, strides, loopedindex, loopdependencies, ind, offset, stride)
     push!(indices, ind)
     push!(offsets, offset % Int8)
+    # push!(offsets, (offset+stride-1) % Int8)
     push!(strides, stride % Int8)
     push!(loopedindex, true)
     push!(loopdependencies, ind)
@@ -249,8 +250,35 @@ function checkforoffset!(
     loopedindex::Vector{Bool}, loopdependencies::Vector{Symbol}, reduceddeps::Vector{Symbol}, ind::Expr
 )::Symbol
 
-    offset, mult_syms = affine_index_expression(ls, ind)
-    if !byterepresentable(offset)
+  offset, mult_syms = affine_index_expression(ls, ind)
+  let deleted = 0, N = length(mult_syms)
+    for n ∈ 1:N
+      ntemp = n - deleted
+      mlt, sym = mult_syms[ntemp]
+      opm = get(ls.opdict, sym, nothing)
+      opm === nothing && continue
+      isconstant(opm) || continue
+      found = false
+      for (opid,(intval,intsz,signed)) ∈ ls.preamble_symint
+        if opid == identifier(opm)
+          offset += intval * mlt
+          deleted += 1
+          deleteat!(mult_syms, ntemp)
+          found = true
+          break
+        end
+      end
+      found && continue
+      for (opid,nt) ∈ ls.preamble_zeros
+        if opid == identifier(opm)
+          deleted += 1
+          deleteat!(mult_syms, ntemp)
+          break
+        end
+      end
+    end
+  end
+  if !byterepresentable(offset)
         if length(mult_syms) == 1
             mlt,sym = only(mult_syms)
             if !byterepresentable(mlt)
@@ -263,7 +291,6 @@ function checkforoffset!(
         vptrarray = gesp_const_offset!(ls, vptrarray, ninds, indices, loopedindex, 1, offset - r)
         offset = r
     end
-    
     # (success && byterepresentable(offset)) || return false, vptrarray
     if length(mult_syms) == 0
         addconstindex!(indices, offsets, strides, loopedindex, offset)
diff --git a/test/shuffleloadstores.jl b/test/shuffleloadstores.jl
@@ -1,6 +1,3 @@
-
-
-
 function dot_simd(a::AbstractVector, b::AbstractVector)
     s = zero(eltype(a))
     @fastmath @inbounds @simd for i ∈ eachindex(a)
@@ -196,6 +193,94 @@ function sumdim2!(r1, r2)
   r1
 end
 
+# Issue 287
+ function my_gemm_noturbo!(out, s::Matrix{UInt8}, V) # plain-loop version (no @avx/@turbo); accumulates into `out`
+           Vcols = size(V, 2)
+           srows = size(s, 1)
+           scols = size(s, 2)
+           k = srows >> 2 # srows ÷ 4; only rows 1:k of `s` are read below
+           rem = srows & 3 # srows mod 4; currently unused (see TODO at the end)
+           @inbounds @fastmath for c in 1:Vcols
+               for j in 1:scols
+                   for l in 1:k
+                       block = s[l, j] # one byte holding four 2-bit fields
+                       for p in 1:4
+                           Aij = (block >> (2 * (p - 1))) & 3 # p-th 2-bit field of `block`, a value in 0:3
+                           out[4*(l - 1) + p, c] += ((Aij >= 2) + (Aij == 3)) * V[j, c] # weight: 0,1 -> 0; 2 -> 1; 3 -> 2
+                       end
+                   end
+               end
+           end
+           # TODO: handle the `rem` leftover (srows & 3); `rem` is computed above but never used.
+       end
+function my_gemm_unroll(out, s::Matrix{UInt8}, V) # @avx version with an explicit `p` loop; accumulates into `out`
+  Vcols = size(V, 2)
+  srows = size(s, 1)
+  scols = size(s, 2)
+  k = srows >> 2 # srows ÷ 4; only rows 1:k of `s` are read below
+  rem = srows & 3 # srows mod 4; currently unused (see TODO at the end)
+  @avx for c in 1:Vcols
+    for j in 1:scols
+      for l in 1:k
+        block = s[l, j] # one byte holding four 2-bit fields
+        for p in 1:4
+          Aij = (block >> (2 * (p - 1))) & 3 # p-th 2-bit field of `block`, a value in 0:3
+          out[4*(l - 1) + p, c] += ((Aij >= 2) + (Aij == 3)) * V[j, c] # row index is 4*(l - 1) + p; weight: 0,1 -> 0; 2 -> 1; 3 -> 2
+        end
+      end
+    end
+  end
+  # TODO: handle the `rem` leftover (srows & 3); `rem` is computed above but never used.
+end
+function my_gemm_manual_unroll(out, s::Matrix{UInt8}, V) # @avx version with the `p` loop written out by hand
+  Vcols = size(V, 2)
+  srows = size(s, 1)
+  scols = size(s, 2)
+  k = srows >> 2 # srows ÷ 4; only rows 1:k of `s` are read below
+  rem = srows & 3 # srows mod 4; currently unused (see TODO at the end)
+  @avx for c in 1:Vcols
+    for j in 1:scols
+      for l in 1:k
+        block = s[l, j] # one byte holding four 2-bit fields
+        # The loop over p = 1:4 is unrolled by hand: the same two statements are repeated for each p.
+        p = 1
+        Aij = (block >> (2 * (p - 1))) & 3 # p-th 2-bit field of `block`, a value in 0:3
+        out[4*(l - 1) + p, c] += ((Aij >= 2) + (Aij == 3)) * V[j, c] # weight: 0,1 -> 0; 2 -> 1; 3 -> 2
+        p = 2
+        Aij = (block >> (2 * (p - 1))) & 3
+        out[4*(l - 1) + p, c] += ((Aij >= 2) + (Aij == 3)) * V[j, c]
+        p = 3
+        Aij = (block >> (2 * (p - 1))) & 3
+        out[4*(l - 1) + p, c] += ((Aij >= 2) + (Aij == 3)) * V[j, c]
+        p = 4
+        Aij = (block >> (2 * (p - 1))) & 3
+        out[4*(l - 1) + p, c] += ((Aij >= 2) + (Aij == 3)) * V[j, c]
+      end
+    end
+  end
+  # TODO: handle the `rem` leftover (srows & 3); `rem` is computed above but never used.
+end
+function my_gemm_nexpr_unroll(out, s::Matrix{UInt8}, V) # @turbo version with the `p` loop generated by @nexprs
+  Vcols = size(V, 2)
+  srows = size(s, 1)
+  scols = size(s, 2)
+  k = srows >> 2 # srows ÷ 4; only rows 1:k of `s` are read below
+  rem = srows & 3 # srows mod 4; currently unused (see TODO at the end)
+  @turbo for c in 1:Vcols
+    for j in 1:scols
+      for l in 1:k
+        block = s[l, j] # one byte holding four 2-bit fields
+        # The loop over p = 1:4 is unrolled via Base.Cartesian.@nexprs, which emits the body once per p.
+        Base.Cartesian.@nexprs 4 p -> begin
+          Aij = (block >> (2 * (p - 1))) & 3 # p-th 2-bit field of `block`, a value in 0:3
+          out[4*(l - 1) + p, c] += ((Aij >= 2) + (Aij == 3)) * V[j, c] # weight: 0,1 -> 0; 2 -> 1; 3 -> 2
+        end
+      end
+    end
+  end
+  # TODO: handle the `rem` leftover (srows & 3); `rem` is computed above but never used.
+end
+
 @testset "shuffles load/stores" begin
     @show @__LINE__
     for i ∈ 1:128
@@ -206,7 +291,7 @@ end
             @test dsimd ≈ cdot_mat(ac, bc)
         end
         @test dsimd ≈ cdot_affine(ac, bc) ≈ cdot_stride(ac, bc)
-        
+
 
         xq = [ntuple(_ -> rand(), Val(4)) for _ ∈ 1:i];
         yq = [ntuple(_ -> rand(), Val(4)) for _ ∈ 1:i];
@@ -230,7 +315,7 @@ end
             Aca = reinterpret(reshape, Float64, Ac);
             Bca = reinterpret(reshape, Float64, Bc);
             cmatmul_array!(Cca, Aca, Bca)
-            
+
             @test Cc1 ≈ Cc2# ≈ Cc3
         end
     end
@@ -250,11 +335,22 @@ end
         ϕ = view(fill(1e5+1e7im, 2*J+17, G+17, H+17, M+17), 9:2*J+9, 9:G+9, 9:H+9, 9:M+9) .= rand.() .+ rand.().*im;
         @test issue209(M, G, J, H, B, ϕ) ≈ issue209_noavx(M, G, J, H, B, ϕ)
     end
-  
+
     s = Array{Float64}(undef, 4, 128, 128);
     s2 = rand(4, 2, 128, 128);
     @test sumdim2_turbo!(s, s2) ≈ sumdim2!(similar(s), s2)
 
+  # issue 287
+  out_test = zeros(100, 10);
+  out_test1 = zeros(100, 10);
+  s = rand(UInt8, 25, 100);
+  V = rand(100, 10);
+  my_gemm_noturbo!(out_test, s, V);
+  my_gemm_unroll(out_test1, s, V);
+  @test out_test ≈ out_test1
+  my_gemm_manual_unroll(fill!(out_test1, 0), s, V);
+  @test out_test ≈ out_test1
+  my_gemm_nexpr_unroll(fill!(out_test1, 0), s, V);
+  @test out_test ≈ out_test1
+  
 end
-
-