Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1891,7 +1891,6 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
}
}

assert(Cache->size());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this return false instead of asserting?
There are other InstructionRule in MFMASmallGemmSingleWaveOpt that make a similar assert -- can you apply the same change in those places as well?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't think an explicit "return false" would be helpful - if Cache is empty at this point then the loop immediately below will do zero iterations and we'll hit the "return false" immediately below that.

The only other similar assert I found was the one in the next inner class down, MFMASmallGemmSingleWaveOpt::IsPermForDSW. Again this will return false if Cache is empty due to the llvm::any_of() call.

auto *DAG = SyncPipe[0].DAG;
for (auto &Elt : *Cache) {
if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU)))
Expand Down Expand Up @@ -1928,8 +1927,6 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
return FitsInGroup;
}

assert(Cache->size());

// Does the VALU have a DS_WRITE successor that is the same as other
// VALU already in the group. The V_PERMs will all share 1 DS_W succ
return llvm::any_of(*Cache, [&SU](SUnit *Elt) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -O1 -global-isel=true < %s

; Function Attrs: nounwind
; Minimal kernel: the iglp.opt intrinsic is the only instruction in the
; scheduling region, so the IGLP strategy runs with nothing to collect.
; NOTE(review): per the PR discussion above, this presumably exercises the
; empty-Cache path that previously hit `assert(Cache->size())` — confirm.
define amdgpu_kernel void @test_iglp_opt() #0 {
entry:
call void @llvm.amdgcn.iglp.opt(i32 0) #4
ret void
}

; Function Attrs: nounwind
; GEMM-like kernel: loads five <32 x float> tiles from LDS (%in), feeds each
; tile to an mfma.f32.32x32x1f32, and stores the five results back to LDS
; (%out). iglp.opt(i32 0) selects an IGLP scheduling strategy for this region.
; NOTE(review): the meaning of the i32 selector (0 here, 1 in the "rev"
; variant below) is defined by AMDGPUIGroupLP.cpp — confirm which strategy
; each value maps to.
define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
entry:
call void @llvm.amdgcn.iglp.opt(i32 0)
%idx = call i32 @llvm.amdgcn.workitem.id.x()
; Five chained GEPs off the previous load address produce five distinct
; 128-byte-aligned LDS load sites.
%load.0.addr = getelementptr <32 x float>, ptr addrspace(3) %in, i32 %idx
%load.0 = load <32 x float>, ptr addrspace(3) %load.0.addr, align 128
%load.1.addr = getelementptr <32 x float>, ptr addrspace(3) %load.0.addr, i32 64
%load.1 = load <32 x float>, ptr addrspace(3) %load.1.addr, align 128
%load.2.addr = getelementptr <32 x float>, ptr addrspace(3) %load.1.addr, i32 128
%load.2 = load <32 x float>, ptr addrspace(3) %load.2.addr, align 128
%load.3.addr = getelementptr <32 x float>, ptr addrspace(3) %load.2.addr, i32 192
%load.3 = load <32 x float>, ptr addrspace(3) %load.3.addr, align 128
%load.4.addr = getelementptr <32 x float>, ptr addrspace(3) %load.3.addr, i32 256
%load.4 = load <32 x float>, ptr addrspace(3) %load.4.addr, align 128
; One MFMA per loaded tile; each result depends on exactly one load.
%mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %load.0, i32 0, i32 0, i32 0)
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %load.1, i32 0, i32 0, i32 0)
%mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %load.2, i32 0, i32 0, i32 0)
%mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %load.3, i32 0, i32 0, i32 0)
%mai.4 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %load.4, i32 0, i32 0, i32 0)
; Store each MFMA result to a distinct LDS slot of %out.
%store.0.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 %idx
store <32 x float> %mai.0, ptr addrspace(3) %store.0.addr, align 128
%store.1.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 64
store <32 x float> %mai.1, ptr addrspace(3) %store.1.addr, align 128
%store.2.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 128
store <32 x float> %mai.2, ptr addrspace(3) %store.2.addr, align 128
%store.3.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 192
store <32 x float> %mai.3, ptr addrspace(3) %store.3.addr, align 128
%store.4.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 256
store <32 x float> %mai.4, ptr addrspace(3) %store.4.addr, align 128
ret void
}

; Function Attrs: nounwind
; Variant of the GEMM-like kernel above using iglp.opt(i32 1) (a different
; strategy selector), with two extra <1 x i64> loads, a urem on them, and an
; aliasing <1 x i64> store that overlaps %store.3.addr. The extra scalar ops
; add dependencies between the vector loads and stores that the plain variant
; does not have.
; NOTE(review): presumably these extras are what triggered the assert being
; removed in this PR — confirm against the reproducer in the PR description.
define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
entry:
call void @llvm.amdgcn.iglp.opt(i32 1)
%idx = call i32 @llvm.amdgcn.workitem.id.x()
%load.0.addr = getelementptr <32 x float>, ptr addrspace(3) %in, i32 %idx
%load.0 = load <32 x float>, ptr addrspace(3) %load.0.addr, align 128
%load.1.addr = getelementptr <32 x float>, ptr addrspace(3) %load.0.addr, i32 64
%load.1 = load <32 x float>, ptr addrspace(3) %load.1.addr, align 128
%load.2.addr = getelementptr <32 x float>, ptr addrspace(3) %load.1.addr, i32 128
%load.2 = load <32 x float>, ptr addrspace(3) %load.2.addr, align 128
%load.3.addr = getelementptr <32 x float>, ptr addrspace(3) %load.2.addr, i32 192
; %L and %L1 load the first 8 bytes at the same addresses as %load.3/%load.4.
%L = load <1 x i64>, ptr addrspace(3) %load.3.addr, align 8
%load.3 = load <32 x float>, ptr addrspace(3) %load.3.addr, align 128
%load.4.addr = getelementptr <32 x float>, ptr addrspace(3) %load.3.addr, i32 256
%L1 = load <1 x i64>, ptr addrspace(3) %load.4.addr, align 8
%load.4 = load <32 x float>, ptr addrspace(3) %load.4.addr, align 128
%B = urem <1 x i64> %L, %L1
%mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %load.0, i32 0, i32 0, i32 0)
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %load.1, i32 0, i32 0, i32 0)
%mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %load.2, i32 0, i32 0, i32 0)
%mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %load.3, i32 0, i32 0, i32 0)
%mai.4 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %load.4, i32 0, i32 0, i32 0)
%store.0.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 %idx
store <32 x float> %mai.0, ptr addrspace(3) %store.0.addr, align 128
%store.1.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 64
store <32 x float> %mai.1, ptr addrspace(3) %store.1.addr, align 128
%store.2.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 128
store <32 x float> %mai.2, ptr addrspace(3) %store.2.addr, align 128
%store.3.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 192
store <32 x float> %mai.3, ptr addrspace(3) %store.3.addr, align 128
%store.4.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 256
store <32 x float> %mai.4, ptr addrspace(3) %store.4.addr, align 128
; Final <1 x i64> store aliases the low 8 bytes of the %mai.3 store above.
store <1 x i64> %B, ptr addrspace(3) %store.3.addr, align 8
ret void
}

; Function Attrs: convergent nocallback nofree nounwind willreturn
declare void @llvm.amdgcn.iglp.opt(i32 immarg) #1

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.amdgcn.workitem.id.x() #2

; Function Attrs: convergent nocallback nofree nosync nounwind willreturn memory(none)
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32 immarg, i32 immarg, i32 immarg) #3

attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,256" }
attributes #1 = { convergent nocallback nofree nounwind willreturn }
attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #3 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
attributes #4 = { convergent nounwind }
Loading