Skip to content

Commit b855b81

Browse files
committed
Fix Tullio issue #131.
1 parent 87ca11a commit b855b81

File tree

5 files changed

+71
-37
lines changed

5 files changed

+71
-37
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "LoopVectorization"
22
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
33
authors = ["Chris Elrod <[email protected]>"]
4-
version = "0.12.97"
4+
version = "0.12.98"
55

66
[deps]
77
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"

src/codegen/lower_load.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -456,8 +456,8 @@ function lower_load_collection!(
456456
offset_dummy_loop = Loop(first(opindices), MaybeKnown(1), MaybeKnown(1024), MaybeKnown(1), Symbol(""), Symbol(""))
457457
unrollcurl₂ = unrolled_curly(op, nouter, offset_dummy_loop, vloop, mask, 1) # interleave always 1 here
458458
inds = mem_offset_u(op, ua, inds_calc_by_ptr_offset, false, 0, ls, false)
459+
# @show op suffix, inds
459460
falseexpr = Expr(:call, lv(:False)); rs = staticexpr(reg_size(ls));
460-
461461
opu₁, opu₂ = isunrolled_sym(op, u₁loopsym, u₂loopsym, vloopsym, ls)
462462
manualunrollu₁ = if opu₁ && u₁ > 1 # both unrolled
463463
if isknown(step(u₁loop)) && sum(Base.Fix2(===,u₁loopsym), getindicesonly(op)) == 1

src/codegen/lower_memory_common.jl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -307,12 +307,14 @@ function mem_offset_u(
307307
ind_by_offset = inds_calc_by_ptr_offset[n] | (ind === CONSTANTZEROINDEX)
308308
offset = convert(Int, offsets[n])
309309
stride = convert(Int, strides[n])
310+
# @show stride, ind, u₂loopsym
310311
indvectorized = ind === vloopsym
311312
indvectorizedmm = _mm & indvectorized
312313
if ind === u₁loopsym
314+
# TODO: should it be stride*incr₁ ???
313315
addvectoroffset!(ret, indvectorizedmm, incr₁, u₁step, vstep, stride, ind, offset, ind_by_offset, indvectorized) # 9 arg
314316
elseif ind === u₂loopsym
315-
addvectoroffset!(ret, indvectorizedmm, incr₂, u₂step, vstep, stride, ind, offset, ind_by_offset, indvectorized) # 9 arg
317+
addvectoroffset!(ret, indvectorizedmm, stride * incr₂, u₂step, vstep, stride, ind, offset, ind_by_offset, indvectorized) # 9 arg
316318
elseif loopedindex[n]
317319
addoffset!(ret, indvectorizedmm, vstep, stride, ind, offset, ind_by_offset) # 7 arg
318320
else

src/modeling/determinestrategy.jl

Lines changed: 37 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -771,40 +771,45 @@ end
771771
# mno, id
772772
# end
773773
function maxnegativeoffset(ls::LoopSet, op::Operation, u::Symbol)
774-
mno::Int = typemin(Int)
775-
id = 0
776-
isknown(step(getloop(ls, u))) || return mno, id
777-
omop = offsetloadcollection(ls)
778-
collectionid, opind = omop.opidcollectionmap[identifier(op)]
779-
collectionid == 0 && return mno, id
780-
@unpack opids = omop
774+
mno::Int = typemin(Int)
775+
id = 0
776+
isknown(step(getloop(ls, u))) || return mno, id
777+
omop = offsetloadcollection(ls)
778+
collectionid, opind = omop.opidcollectionmap[identifier(op)]
779+
collectionid == 0 && return mno, id
780+
@unpack opids = omop
781781

782-
# offsetcol = offsets[collectionid]
783-
opidcol = opids[collectionid]
784-
opid = identifier(op)
785-
# opoffs = offsetcol[opind]
786-
opoffs = getoffsets(op)
787-
ops = operations(ls)
788-
opindices = getindicesonly(op)
789-
for (i,oppid) ∈ enumerate(opidcol)
790-
opid == oppid && continue
791-
opp = ops[oppid]
792-
oppoffs = getoffsets(opp)
793-
mnonew::Int = typemin(Int)
794-
for i ∈ eachindex(opindices)
795-
if opindices[i] === u
796-
mnonew = ((opoffs[i] % Int) - (oppoffs[i] % Int))
797-
elseif opoffs[i] != oppoffs[i]
798-
mnonew = 1
799-
break
800-
end
801-
end
802-
if mno < mnonew < 0
803-
mno = mnonew
804-
id = identifier(opp)
805-
end
782+
opidcol = opids[collectionid]
783+
opid = identifier(op)
784+
opoffs = getoffsets(op)
785+
opstrd = getstrides(op)
786+
ops = operations(ls)
787+
opindices = getindicesonly(op)
788+
for oppid ∈ opidcol
789+
opid == oppid && continue
790+
opp = ops[oppid]
791+
oppoffs = getoffsets(opp)
792+
oppstrd = getstrides(opp)
793+
mnonew::Int = typemin(Int)
794+
for i ∈ eachindex(opindices)
795+
strd = opstrd[i]
796+
strd == oppstrd[i] == 1 || continue
797+
if opindices[i] === u
798+
mnonew = (opoffs[i] % Int) - (oppoffs[i] % Int)
799+
# mnonew_t, mnonew_rem = divrem((opoffs[i] % Int) - (oppoffs[i] % Int), strd % Int)
800+
# mnonew_rem == 0 || continue
801+
# mnonew = mnonew_t
802+
elseif opoffs[i] != oppoffs[i]
803+
mnonew = 1
804+
break
805+
end
806806
end
807-
mno, id
807+
if mno < mnonew < 0
808+
mno = mnonew
809+
id = identifier(opp)
810+
end
811+
end
812+
mno, id
808813
end
809814
function maxnegativeoffset(ls::LoopSet, op::Operation, unrollsyms::UnrollSymbols)
810815
@unpack u₁loopsym, u₂loopsym, vloopsym = unrollsyms

test/shuffleloadstores.jl

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -401,6 +401,31 @@ function reverse_part_ref(n1,n2)
401401
return A
402402
end
403403

404+
405+
function tullio_issue_131_ref(arr)
406+
M, N = size(arr)
407+
out = zeros(M >>> 1, N >>> 1)
408+
@inbounds @fastmath for j in axes(out,2)
409+
for i in axes(out,1)
410+
out[i, j] = arr[2i, 2j] + arr[2i - 1, 2j] + arr[2i - 1, 2j - 1] + arr[2i, 2j - 1]
411+
end
412+
end
413+
out
414+
end
415+
416+
417+
function tullio_issue_131(arr)
418+
M, N = size(arr)
419+
out = zeros(M >>> 1, N >>> 1)
420+
@turbo for j in axes(out,2)
421+
for i in axes(out,1)
422+
out[i, j] = arr[2i, 2j] + arr[2i - 1, 2j] + arr[2i - 1, 2j - 1] + arr[2i, 2j - 1]
423+
end
424+
end
425+
out
426+
end
427+
428+
404429
@testset "shuffles load/stores" begin
405430
@show @__LINE__
406431
for i ∈ 1:128
@@ -425,8 +450,10 @@ end
425450
end
426451
@test qsimd ≈ Base.vect(qdot_affine(xqv, yqv)...) ≈ Base.vect(qdot_stride(xqv, yqv)...)
427452

428-
if VERSION ≥ v"1.6.0-rc1"
429-
for j ∈ max(1,i-5):i+5, k ∈ max(1,i-5,i+5)
453+
for j ∈ max(1,i-5):i+5, k ∈ max(1,i-5,i+5)
454+
A = rand(j+1, k);
455+
@test tullio_issue_131(A) ≈ tullio_issue_131_ref(A)
456+
if VERSION ≥ v"1.6.0-rc1"
430457
Ac = rand(Complex{Float64}, j, i);
431458
Bc = rand(Complex{Float64}, i, k);
432459
Cc1 = Ac*Bc;

0 commit comments

Comments
 (0)