Loop splitting needs to add reductions. Also, the split loop's vector width is currently set equal to the old loop's, but this needs improvement.

chriselrod · chriselrod · commit ca00df66e7df · 2020-06-29T00:54:08.000-05:00
diff --git a/src/reconstruct_loopset.jl b/src/reconstruct_loopset.jl
@@ -417,8 +417,15 @@ function extract_external_functions!(ls::LoopSet, offset::Int)
 end
 function sizeofeltypes(v, num_arrays)::Int
     T = typeeltype(v[1])
+    if !VectorizationBase.SIMD_NATIVE_INTEGERS && T <: Integer # hack
+        return VectorizationBase.REGISTER_SIZE
+    end
     for i ∈ 2:num_arrays
-        T = promote_type(T, typeeltype(v[i]))
+        Ttemp = typeeltype(v[i])
+        if !VectorizationBase.SIMD_NATIVE_INTEGERS && Ttemp <: Integer # hack
+            return VectorizationBase.REGISTER_SIZE
+        end 
+        T = promote_type(T, Ttemp)
     end
     sizeof(T)
 end
@@ -485,8 +492,9 @@ Execute an `@avx` block. The block's code is represented via the arguments:
 - `vargs...` holds the encoded pointers of all the arrays (see `VectorizationBase`'s various pointer types).
 """
 @generated function _avx_!(::Val{UNROLL}, ::Type{OPS}, ::Type{ARF}, ::Type{AM}, ::Type{LPSYM}, lb::LB, vargs...) where {UNROLL, OPS, ARF, AM, LPSYM, LB}
-    1 + 1 # Irrelevant line you can comment out/in to force recompilation...
+    # 1 + 1 # Irrelevant line you can comment out/in to force recompilation...
     ls = _avx_loopset(OPS.parameters, ARF.parameters, AM.parameters, LPSYM.parameters, LB.parameters, vargs)
     # @show avx_body(ls, UNROLL)
+    # @show UNROLL, OPS, ARF, AM, LPSYM, LB
     avx_body(ls, UNROLL)
 end
diff --git a/src/split_loops.jl b/src/split_loops.jl
@@ -20,8 +20,7 @@ end
 function append_if_included!(vnew, vold, included)
     for (i, v) ∈ vold
         id = included[i]
-        iszero(id) && continue
-        push!(vnew, (id, v))
+        iszero(id) || push!(vnew, (id, v))
     end
 end
 
@@ -44,6 +43,18 @@ function split_loopset(ls::LoopSet, ids)
     append_if_included!(ls_new.preamble_symfloat, ls.preamble_symfloat, included)
     append_if_included!(ls_new.preamble_zeros, ls.preamble_zeros, included)
     append_if_included!(ls_new.preamble_funcofeltypes, ls.preamble_funcofeltypes, included)
+    for i ∈ ls.outer_reductions
+        id = included[i]
+        iszero(id) || push!(ls_new.outer_reductions, id)
+    end
+    # TODO: allow them to differ. E.g., non-AVX2 x86 CPUs don't have efficient integer calculations,
+    # so it would be profitable to split for this reason.
+    # However, the default assumption for the vector width will currently be wrong, so we should
+    # calculate it correctly (like ls.vector_width[]); a wrong (too high) value will encourage
+    # splitting when it shouldn't.
+    # Current behavior is incorrect when the VECWIDTH chosen actually does differ between the
+    # split loops and the loops are statically sized, because code gen will then assume it is correct...
+    ls_new.vector_width[] = ls.vector_width[] 
     ls_new
 end