Skip to content

Commit ca00df6

Browse files
committed
Loop splitting needs to add reductions. Also, the split loop's vector width is currently set equal to the old loop's, but this needs improvement.
1 parent 2cba920 commit ca00df6

File tree

2 files changed

+23
-4
lines changed

2 files changed

+23
-4
lines changed

src/reconstruct_loopset.jl

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -417,8 +417,15 @@ function extract_external_functions!(ls::LoopSet, offset::Int)
417417
end
418418
function sizeofeltypes(v, num_arrays)::Int
419419
T = typeeltype(v[1])
420+
if !VectorizationBase.SIMD_NATIVE_INTEGERS && T <: Integer # hack
421+
return VectorizationBase.REGISTER_SIZE
422+
end
420423
for i ∈ 2:num_arrays
421-
T = promote_type(T, typeeltype(v[i]))
424+
Ttemp = typeeltype(v[i])
425+
if !VectorizationBase.SIMD_NATIVE_INTEGERS && Ttemp <: Integer # hack
426+
return VectorizationBase.REGISTER_SIZE
427+
end
428+
T = promote_type(T, Ttemp)
422429
end
423430
sizeof(T)
424431
end
@@ -485,8 +492,9 @@ Execute an `@avx` block. The block's code is represented via the arguments:
485492
- `vargs...` holds the encoded pointers of all the arrays (see `VectorizationBase`'s various pointer types).
486493
"""
487494
@generated function _avx_!(::Val{UNROLL}, ::Type{OPS}, ::Type{ARF}, ::Type{AM}, ::Type{LPSYM}, lb::LB, vargs...) where {UNROLL, OPS, ARF, AM, LPSYM, LB}
488-
1 + 1 # Irrelevant line you can comment out/in to force recompilation...
495+
# 1 + 1 # Irrelevant line you can comment out/in to force recompilation...
489496
ls = _avx_loopset(OPS.parameters, ARF.parameters, AM.parameters, LPSYM.parameters, LB.parameters, vargs)
490497
# @show avx_body(ls, UNROLL)
498+
# @show UNROLL, OPS, ARF, AM, LPSYM, LB
491499
avx_body(ls, UNROLL)
492500
end

src/split_loops.jl

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,7 @@ end
2020
function append_if_included!(vnew, vold, included)
2121
for (i, v) ∈ vold
2222
id = included[i]
23-
iszero(id) && continue
24-
push!(vnew, (id, v))
23+
iszero(id) || push!(vnew, (id, v))
2524
end
2625
end
2726

@@ -44,6 +43,18 @@ function split_loopset(ls::LoopSet, ids)
4443
append_if_included!(ls_new.preamble_symfloat, ls.preamble_symfloat, included)
4544
append_if_included!(ls_new.preamble_zeros, ls.preamble_zeros, included)
4645
append_if_included!(ls_new.preamble_funcofeltypes, ls.preamble_funcofeltypes, included)
46+
for i ∈ ls.outer_reductions
47+
id = included[i]
48+
iszero(id) || push!(ls_new.outer_reductions, id)
49+
end
50+
# TODO: allow them to differ. E.g., non-AVX2 x86 cpus don't have efficient integer calculations
51+
# Therefore, it would be profitable to split for this reason.
52+
# However, currently the default assumption in vector width will be wrong, so we should calculate
53+
# it correctly (like ls.vector_width[]); wrong (too high) value will encourage splitting when
54+
# it shouldn't.
55+
# Current behavior is incorrect when VECWIDTH chosen does actually differ between
56+
# split loops and the loops are statically sized, because code gen will then assume it is correct...
57+
ls_new.vector_width[] = ls.vector_width[]
4758
ls_new
4859
end
4960

0 commit comments

Comments
 (0)