Skip to content

Commit 8789ad1

Browse files
committed
Review changes
1 parent 4516f96 commit 8789ad1

File tree

2 files changed

+11
-31
lines changed

2 files changed

+11
-31
lines changed

llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp

Lines changed: 5 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2392,23 +2392,23 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
23922392
Result = false;
23932393

23942394
else if (MI.isInlineAsm()) {
2395-
auto &TRI = TII->getRegisterInfo();
2395+
const SIRegisterInfo &TRI = TII->getRegisterInfo();
23962396
auto &MRI = MI.getParent()->getParent()->getRegInfo();
23972397
bool SGPR_used = false, VGPR_used = false, VMFMA_used = false,
23982398
MayLoad = MI.mayLoad(), MayStore = MI.mayStore();
23992399
for (const MachineOperand &Operand : MI.operands())
24002400
if (Operand.isReg()) {
24012401
auto &RegClass = *TRI.getRegClassForOperandReg(MRI, Operand);
2402-
if (TRI.isVGPRClass(&RegClass))
2402+
if (TRI.hasVGPRs(&RegClass))
24032403
VGPR_used = true;
2404-
if (TRI.isAGPRClass(&RegClass) || TRI.getRegSizeInBits(RegClass) > 128)
2404+
if (TRI.hasAGPRs(&RegClass) || TRI.getRegSizeInBits(RegClass) > 128) // Registers wider than 128 bits are usually only used by MFMA instructions, so register width is used as a heuristic when guessing the sched group mask of the inline asm.
24052405
VMFMA_used = true;
2406-
if (TRI.isSGPRClass(&RegClass))
2406+
if (TRI.hasSGPRs(&RegClass))
24072407
SGPR_used = true;
24082408
}
24092409

24102410
unsigned long InlineAsmMask = 0;
2411-
if (VGPR_used && !SGPR_used && !VMFMA_used && !MayLoad && !MayStore)
2411+
if (VGPR_used && !VMFMA_used && !MayLoad && !MayStore)
24122412
InlineAsmMask |= (unsigned long)SchedGroupMask::VALU;
24132413
if (SGPR_used && !MayLoad && !MayStore)
24142414
InlineAsmMask |= (unsigned long)SchedGroupMask::SALU;
@@ -2434,16 +2434,6 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
24342434

24352435
Result = ((unsigned long)SGMask & InlineAsmMask) != 0;
24362436

2437-
// Original implementation
2438-
#if 0
2439-
StringRef Text = MI.getOperand(0).getSymbolName();
2440-
if (Text.find("SGMASK:") != std::string::npos) {
2441-
Text = Text.substr(Text.find("SGMASK:") + strlen("SGMASK:"));
2442-
Text = Text.substr(0, Text.find_first_of(" \t\r\n"));
2443-
unsigned long InlineAsmMask = std::stoul(Text.str(), nullptr, 0);
2444-
Result = ((unsigned long)SGMask & InlineAsmMask) != 0;
2445-
}
2446-
#endif
24472437
}
24482438

24492439
else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&

llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll

Lines changed: 6 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,13 @@
1-
; RUN: llc -O3 -mcpu=gfx942 < %s | FileCheck %s
1+
; RUN: llc -mcpu=gfx942 < %s | FileCheck %s
22
; CHECK: v_add_f32_e32
33
; CHECK-NEXT: ;;#ASMSTART
44
; CHECK-NEXT: v_mfma_f64
55
; CHECK-NEXT: ;;#ASMEND
66
; CHECK: v_add_f32_e32
77
; ModuleID = '<stdin>'
8-
source_filename = "llvm-link"
9-
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
108
target triple = "amdgcn-amd-amdhsa"
119

1210
@llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @__hip_cuid_bffb86447932ec40 to ptr)], section "llvm.metadata"
13-
@__hip_cuid_bffb86447932ec40 = addrspace(1) global i8 0
1411

1512
; Function Attrs: convergent mustprogress norecurse nounwind
1613
define protected amdgpu_kernel void @_Z17group4_sum_floaatPfPKfi(ptr addrspace(1) noalias noundef writeonly captures(none) %to.coerce, ptr addrspace(1) noalias noundef readonly captures(none) %from.coerce, i32 noundef %length) local_unnamed_addr #0 {
@@ -28,18 +25,18 @@ if.then: ; preds = %entry
2825
%mul3 = shl nsw i32 %add, 2
2926
%idx.ext4 = sext i32 %mul3 to i64
3027
%add.ptr5 = getelementptr inbounds float, ptr addrspace(1) %from.coerce, i64 %idx.ext4
31-
%2 = load <2 x float>, ptr addrspace(1) %add.ptr5, align 16, !tbaa !0
28+
%2 = load <2 x float>, ptr addrspace(1) %add.ptr5, align 16
3229
%a20 = add i64 %idx.ext4, 2
3330
%a21 = getelementptr inbounds float, ptr addrspace(1) %from.coerce, i64 %a20
34-
%a22 = load <2 x float>, ptr addrspace(1) %a21, align 16, !tbaa !0
31+
%a22 = load <2 x float>, ptr addrspace(1) %a21, align 16
3532
%3 = extractelement <2 x float> %a22, i64 1
3633
%4 = extractelement <2 x float> %2, i64 0
37-
%5 = tail call contract noundef float asm "v_mfma_f64_4x4x4f64 $0, $1, $2, 0", "=a,v,v"(<2 x float> %2, <2 x float> %a22) #3, !srcloc !3
34+
%5 = tail call contract noundef float asm "v_mfma_f64_4x4x4f64 $0, $1, $2, 0", "=a,v,v"(<2 x float> %2, <2 x float> %a22) #3
3835
%6 = extractelement <2 x float> %2, i64 1
3936
%7 = extractelement <2 x float> %a22, i64 0
4037
%add6 = fadd contract float %6, %7
4138
%add7 = fadd contract float %5, %add6
42-
store float %add7, ptr addrspace(1) %add.ptr, align 4, !tbaa !4
39+
store float %add7, ptr addrspace(1) %add.ptr, align 4
4340
tail call void @llvm.amdgcn.sched.group.barrier(i32 16, i32 1, i32 0)
4441
tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 5, i32 0)
4542
tail call void @llvm.amdgcn.sched.group.barrier(i32 16, i32 1, i32 0)
@@ -64,11 +61,4 @@ declare void @llvm.amdgcn.sched.group.barrier(i32 immarg, i32 immarg, i32 immarg
6461
attributes #0 = { convergent mustprogress norecurse nounwind "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,1024" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx942" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" "uniform-work-group-size"="true" }
6562
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
6663
attributes #2 = { convergent nocallback nofree nounwind willreturn }
67-
attributes #3 = { convergent nounwind memory(none) }
68-
69-
!0 = !{!1, !1, i64 0}
70-
!1 = !{!"omnipotent char", !2, i64 0}
71-
!2 = !{!"Simple C++ TBAA"}
72-
!3 = !{i64 129}
73-
!4 = !{!5, !5, i64 0}
74-
!5 = !{!"float", !1, i64 0}
64+
attributes #3 = { convergent nounwind memory(none) }

0 commit comments

Comments
 (0)