Skip to content

Commit 68b749f

Browse files
committed
Testcases and bugfix
1 parent 4a5f90e commit 68b749f

File tree

2 files changed

+80
-3
lines changed

2 files changed

+80
-3
lines changed

llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2402,7 +2402,7 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
24022402
if (TRI.hasVGPRs(&RegClass)) {
24032403
VGPR_used = true;
24042404
if (Operand.isUse() && TRI.getRegSizeInBits(RegClass) == 32)
2405-
VReg32_used = false;
2405+
VReg32_used = true;
24062406
}
24072407
// >= 128 bit registers are usually only used by MFMA instructions, so
24082408
// we're using that as a heuristic to guess the schedule group mask of

llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll

Lines changed: 79 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
; RUN: llc -mcpu=gfx942 < %s | FileCheck %s
2+
; CHECK-LABEL: test_mfma
23
; CHECK: v_add_f32_e32
34
; CHECK-NEXT: ;;#ASMSTART
45
; CHECK-NEXT: v_mfma_f64
@@ -8,7 +9,43 @@
89
target triple = "amdgcn-amd-amdhsa"
910

1011
; Function Attrs: convergent mustprogress norecurse nounwind
11-
define protected amdgpu_kernel void @_Z17group4_sum_floaatPfPKfi(ptr addrspace(1) noalias noundef writeonly captures(none) %to.coerce, ptr addrspace(1) noalias noundef readonly captures(none) %from.coerce, i32 noundef %length) local_unnamed_addr #0 {
12+
define protected amdgpu_kernel void @test_valu(ptr addrspace(1) noalias noundef writeonly captures(none) %to.coerce, ptr addrspace(1) noalias noundef readonly captures(none) %from.coerce, i32 noundef %k, ptr addrspace(1) noundef writeonly captures(none) %ret.coerce, i32 noundef %length) local_unnamed_addr #0 {
13+
entry:
14+
%0 = tail call i32 @llvm.amdgcn.workgroup.id.x()
15+
%mul = shl i32 %0, 6
16+
%1 = tail call i32 @llvm.amdgcn.workitem.id.x()
17+
%add = add i32 %mul, %1
18+
%cmp = icmp slt i32 %add, %length
19+
br i1 %cmp, label %if.then, label %if.end
20+
21+
if.then: ; preds = %entry
22+
%idx.ext = sext i32 %add to i64
23+
%add.ptr = getelementptr inbounds float, ptr addrspace(1) %to.coerce, i64 %idx.ext
24+
%mul4 = shl nsw i32 %add, 2
25+
%idx.ext5 = sext i32 %mul4 to i64
26+
%add.ptr6 = getelementptr inbounds float, ptr addrspace(1) %from.coerce, i64 %idx.ext5
27+
%2 = load <4 x float>, ptr addrspace(1) %add.ptr6, align 16
28+
%3 = extractelement <4 x float> %2, i64 3
29+
%4 = extractelement <4 x float> %2, i64 0
30+
%5 = tail call contract noundef float asm "v_add_f32_e32 $0, $1, $2", "=v,v,v"(float %3, float %4) #3
31+
%6 = extractelement <4 x float> %2, i64 1
32+
%7 = extractelement <4 x float> %2, i64 2
33+
%add7 = fadd contract float %6, %7
34+
%add8 = fadd contract float %5, %add7
35+
store float %add8, ptr addrspace(1) %add.ptr, align 4
36+
%mul9 = mul nsw i32 %k, 3
37+
store i32 %mul9, ptr addrspace(1) %ret.coerce, align 4
38+
tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 6, i32 0)
39+
tail call void @llvm.amdgcn.sched.group.barrier(i32 4, i32 1, i32 0)
40+
tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 4, i32 0)
41+
br label %if.end
42+
43+
if.end: ; preds = %if.then, %entry
44+
ret void
45+
}
46+
47+
; Function Attrs: convergent mustprogress norecurse nounwind
48+
define protected amdgpu_kernel void @test_mfma(ptr addrspace(1) noalias noundef writeonly captures(none) %to.coerce, ptr addrspace(1) noalias noundef readonly captures(none) %from.coerce, i32 noundef %length) local_unnamed_addr #0 {
1249
entry:
1350
%0 = tail call i32 @llvm.amdgcn.workgroup.id.x()
1451
%mul = shl i32 %0, 6
@@ -47,6 +84,45 @@ if.end: ; preds = %if.then, %entry
4784
ret void
4885
}
4986

87+
@_ZZ16group4_sum_floatPfPKfE6cpymem = internal addrspace(3) global [8 x float] undef, align 16
88+
89+
; Function Attrs: convergent mustprogress norecurse nounwind
90+
define protected amdgpu_kernel void @test_ds(ptr addrspace(1) noalias noundef writeonly captures(none) %to.coerce, ptr addrspace(1) noalias noundef readonly captures(none) %from.coerce, i32 noundef %length) local_unnamed_addr #0 {
91+
entry:
92+
%0 = tail call i32 @llvm.amdgcn.workgroup.id.x()
93+
%mul = shl i32 %0, 6
94+
%1 = tail call i32 @llvm.amdgcn.workitem.id.x()
95+
%add = add i32 %mul, %1
96+
%cmp = icmp slt i32 %add, %length
97+
br i1 %cmp, label %if.then, label %if.end
98+
99+
if.then: ; preds = %entry
100+
%idx.ext = sext i32 %add to i64
101+
%add.ptr = getelementptr inbounds float, ptr addrspace(1) %to.coerce, i64 %idx.ext
102+
%mul3 = shl nsw i32 %add, 2
103+
%idx.ext4 = sext i32 %mul3 to i64
104+
%add.ptr5 = getelementptr inbounds float, ptr addrspace(1) %from.coerce, i64 %idx.ext4
105+
%2 = load <2 x float>, ptr addrspace(1) %add.ptr5, align 16
106+
%a20 = add i64 %idx.ext4, 2
107+
%a21 = getelementptr inbounds float, ptr addrspace(1) %from.coerce, i64 %a20
108+
%a22 = load <2 x float>, ptr addrspace(1) %a21, align 16
109+
%3 = extractelement <2 x float> %a22, i64 1
110+
%4 = extractelement <2 x float> %2, i64 0
111+
%5 = tail call contract noundef float asm "ds_read_b32 $0, $1 offset:0", "=v,v,~{memory}"(i32 ptrtoint (ptr addrspacecast (ptr addrspace(3) @_ZZ16group4_sum_floatPfPKfE6cpymem to ptr) to i32)) #4
112+
%6 = extractelement <2 x float> %2, i64 1
113+
%7 = extractelement <2 x float> %a22, i64 0
114+
%add6 = fadd contract float %6, %7
115+
%add7 = fadd contract float %5, %add6
116+
store float %add7, ptr addrspace(1) %add.ptr, align 4
117+
tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 7, i32 0)
118+
tail call void @llvm.amdgcn.sched.group.barrier(i32 128, i32 1, i32 0)
119+
tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 1, i32 0)
120+
br label %if.end
121+
122+
if.end: ; preds = %if.then, %entry
123+
ret void
124+
}
125+
50126
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
51127
declare noundef i32 @llvm.amdgcn.workgroup.id.x() #1
52128

@@ -56,7 +132,8 @@ declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() #1
56132
; Function Attrs: convergent nocallback nofree nounwind willreturn
57133
declare void @llvm.amdgcn.sched.group.barrier(i32 immarg, i32 immarg, i32 immarg) #2
58134

59-
attributes #0 = { convergent mustprogress norecurse nounwind "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,1024" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx942" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" "uniform-work-group-size"="true" }
135+
attributes #0 = { convergent mustprogress norecurse nounwind "amdgpu-agpr-alloc"="1" "amdgpu-flat-work-group-size"="1,1024" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx942" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" "uniform-work-group-size"="true" }
60136
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
61137
attributes #2 = { convergent nocallback nofree nounwind willreturn }
62138
attributes #3 = { convergent nounwind memory(none) }
139+
attributes #4 = { convergent nounwind }

0 commit comments

Comments
 (0)