Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
65 commits
Select commit Hold shift + click to select a range
aec2e4f
Initial commit
justinfargnoli Jul 1, 2024
33efd19
Prefer early return
justinfargnoli Jul 8, 2024
c557840
clang-format
justinfargnoli Jul 8, 2024
7212420
Add negative test for `ISD::TRUNCATE`
justinfargnoli Jul 8, 2024
01ad890
Add cvt and cvt_not test
justinfargnoli Jul 12, 2024
a015d23
Prefer early return
justinfargnoli Jul 12, 2024
18bc242
clang-format
justinfargnoli Jul 12, 2024
04f1ec4
Correct trunc_cvt_not test
justinfargnoli Jul 12, 2024
289fd4b
Delete NVPTXISelLowering implementation
justinfargnoli Jul 15, 2024
f9b01f1
Perform optimization via DAGCombine + TLI
justinfargnoli Jul 15, 2024
e6b2e1e
Update variadics-backend.ll
justinfargnoli Jul 16, 2024
4d1cc02
Save work
justinfargnoli Jul 17, 2024
8f5a351
Modify boolean-patterns.ll
justinfargnoli Jul 17, 2024
7cc72b3
Revert "Save work"
justinfargnoli Jul 17, 2024
0d44023
Save work
justinfargnoli Jul 17, 2024
f0777f3
Remove improper TLI use
justinfargnoli Jul 17, 2024
6d603cf
Remove white space
justinfargnoli Jul 17, 2024
e5be174
clang-format
justinfargnoli Jul 17, 2024
058c35e
Implement transform in DAG combine
justinfargnoli Jul 17, 2024
3ea5867
Save work
justinfargnoli Jul 22, 2024
b2afe79
Remove isTruncateFree
justinfargnoli Jul 22, 2024
04ae5d4
Use ptr in test
justinfargnoli Jul 22, 2024
830442a
Only run on free truncs
justinfargnoli Jul 22, 2024
401c847
Add comment
justinfargnoli Jul 22, 2024
6bec31d
clang-format
justinfargnoli Jul 22, 2024
bf0620d
explicitly mention GPU in comment
justinfargnoli Jul 22, 2024
3e7aa00
clang-format
justinfargnoli Jul 22, 2024
e8046cb
Add comment to TLI function
justinfargnoli Jul 22, 2024
0d58daf
clang-format
justinfargnoli Jul 22, 2024
09b2394
Update boolean-patterns.ll
justinfargnoli Jul 22, 2024
30c5993
Use isNarrowingProfitable instead of shouldReduceRegisterPressure
justinfargnoli Jul 23, 2024
9cd0dee
clang-format
justinfargnoli Jul 23, 2024
efff174
Update tests
justinfargnoli Sep 12, 2024
0c33486
Update more tests
justinfargnoli Sep 12, 2024
41df970
Update even more tests
justinfargnoli Sep 12, 2024
b1eb61a
Narrowing vectors isn't profitable | Update an extraordinary number o…
justinfargnoli Sep 12, 2024
939885f
Changes with additional mov instructions
justinfargnoli Sep 14, 2024
f0f4ad0
Changes with non-trivial diffs
justinfargnoli Sep 14, 2024
3767c76
Changes with a higher instruction count
justinfargnoli Sep 14, 2024
eef0dbc
Non-trivial diff
justinfargnoli Sep 15, 2024
4d804df
Prefer readable condition
justinfargnoli Sep 15, 2024
6c2ae42
Update 2 X86 tests that I missed
justinfargnoli Sep 15, 2024
e3f8d9f
Update register names for NVPTX/boolean-patterns.ll
justinfargnoli Sep 15, 2024
13edcaa
Move isTypeLegal condition up
justinfargnoli Sep 16, 2024
a52b459
Update AMDGPU/computeNumSignBits-mul.ll
justinfargnoli Sep 19, 2024
06a0e68
Address comment
justinfargnoli Sep 20, 2024
401bb07
Prevent infinite loop by checking TLI.IsDesirableToPromoteOp()
justinfargnoli Sep 24, 2024
d887b68
Fixup previous commit
justinfargnoli Sep 24, 2024
5811e11
clang-format
justinfargnoli Sep 24, 2024
0e931f3
Use new isNarrowingProfitable API
justinfargnoli Sep 24, 2024
281f897
Update tests on ToT
justinfargnoli Sep 24, 2024
f1fee8e
Update NVPTX isNarrowingProfitable API
justinfargnoli Sep 24, 2024
47fadcd
Update AMDGPU tests
justinfargnoli Sep 24, 2024
9ed8f93
Pass the correct SDNode to isNarrowingProfitable in DAGCombiner
justinfargnoli Sep 24, 2024
2aa5bbc
Update AMDGPU tests
justinfargnoli Sep 24, 2024
2f9995f
Update amdgpu-codegenprepare-idiv.ll
justinfargnoli Sep 25, 2024
1d681b9
Update NVPTX tests
justinfargnoli Oct 31, 2024
bbf1635
Address comments
justinfargnoli Oct 31, 2024
a6c2465
[AMDGPU] Cleanup xor.ll
justinfargnoli Nov 1, 2024
7f5bd91
[AMDGPU] revert xor.ll si-annotate-cf.ll changes
justinfargnoli Nov 1, 2024
5ab861f
[AMDGPU] Revert and.ll changes
justinfargnoli Nov 1, 2024
6dfabf6
[AMDGPU] It's not profitable to narrow to i1
justinfargnoli Nov 1, 2024
d3b68b1
Fix rebase
justinfargnoli May 6, 2025
53965f2
Fix AMD infinite loop
justinfargnoli May 6, 2025
99da17d
Update tests
justinfargnoli May 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 33 additions & 2 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6036,6 +6036,10 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
HandOpcode == ISD::ANY_EXTEND_VECTOR_INREG) &&
LegalTypes && !TLI.isTypeDesirableForOp(LogicOpcode, XVT))
return SDValue();
// Prevent an infinite loop if the target prefers the inverse
// transformation.
if (TLI.isNarrowingProfitable(N, XVT, VT))
return SDValue();
// logic_op (hand_op X), (hand_op Y) --> hand_op (logic_op X, Y)
SDNodeFlags LogicFlags;
LogicFlags.setDisjoint(N->getFlags().hasDisjoint() &&
Expand All @@ -6048,6 +6052,9 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {

// logic_op (truncate x), (truncate y) --> truncate (logic_op x, y)
if (HandOpcode == ISD::TRUNCATE) {
// Don't create a logic op on an illegal type.
if (!TLI.isTypeLegal(XVT))
return SDValue();
// If both operands have other uses, this transform would create extra
// instructions without eliminating anything.
if (!N0.hasOneUse() && !N1.hasOneUse())
Expand All @@ -6059,10 +6066,12 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
if (LegalOperations && !TLI.isOperationLegal(LogicOpcode, XVT))
return SDValue();
// Be extra careful sinking truncate. If it's free, there's no benefit in
// widening a binop. Also, don't create a logic op on an illegal type.
// widening a binop.
if (TLI.isZExtFree(VT, XVT) && TLI.isTruncateFree(XVT, VT))
return SDValue();
if (!TLI.isTypeLegal(XVT))
// Prevent an infinite loop if the target prefers the inverse
// transformation.
if (TLI.isNarrowingProfitable(N, XVT, VT))
return SDValue();
SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
return DAG.getNode(HandOpcode, DL, VT, Logic);
Expand Down Expand Up @@ -15869,6 +15878,28 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
break;
}

if (!LegalOperations || TLI.isOperationLegal(N0.getOpcode(), VT)) {
switch (N0.getOpcode()) {
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
case ISD::AND:
case ISD::OR:
case ISD::XOR:
if (!N0.hasOneUse() || !VT.isScalarInteger())
break;
if (!TLI.isNarrowingProfitable(N0.getNode(), SrcVT, VT))
break;
SDValue NarrowL = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0));
SDValue NarrowR = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(1));
SDValue TruncatedOp =
DAG.getNode(N0.getOpcode(), DL, VT, NarrowL, NarrowR);
if (TLI.IsDesirableToPromoteOp(TruncatedOp, SrcVT))
break;
Comment on lines +15895 to +15898
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Needing to speculatively create some nodes in case it's profitable is unfortunate, is there a way to avoid this

return TruncatedOp;
}
}

return SDValue();
}

Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1040,6 +1040,8 @@ bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
case ISD::MUL:
case ISD::SETCC:
case ISD::SELECT:
if (DestVT.getScalarSizeInBits() == 1)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

getScalarType() == i1

return false;
if (Subtarget->has16BitInsts() &&
(!DestVT.isVector() || !Subtarget->hasVOP3PInsts())) {
// Don't narrow back down to i16 if promoted to i32 already.
Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,14 @@ class NVPTXTargetLowering : public TargetLowering {
DstTy->getPrimitiveSizeInBits() == 32;
}

bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override {
  // Narrowing is only worthwhile for scalar integers; on NVPTX the
  // 64-bit -> 32-bit truncation is free in SASS.
  if (SrcVT.isScalarInteger() && DestVT.isScalarInteger())
    return SrcVT.getFixedSizeInBits() == 64 &&
           DestVT.getFixedSizeInBits() == 32;
  return false;
}

EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
EVT VT) const override {
if (VT.isVector())
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -1438,8 +1438,8 @@ def : Pat<(mul i1:$a, imm:$b), (ANDb1ri $a, imm:$b)>;
// These transformations were once reliably performed by instcombine, but thanks
// to poison semantics they are no longer safe for LLVM IR, perform them here
// instead.
def : Pat<(select i1:$a, i1:$b, 0), (ANDb1rr $a, $b)>;
def : Pat<(select i1:$a, 1, i1:$b), (ORb1rr $a, $b)>;
def : Pat<(select Int1Regs:$a, Int1Regs:$b, 0), (ANDb1rr Int1Regs:$a, Int1Regs:$b)>;
def : Pat<(select Int1Regs:$a, 1, Int1Regs:$b), (ORb1rr Int1Regs:$a, Int1Regs:$b)>;

// Lower logical v2i16/v4i8 ops as bitwise ops on b32.
foreach vt = [v2i16, v4i8] in {
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35425,6 +35425,8 @@ bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
bool X86TargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
                                              EVT DestVT) const {
  // Only scalar integer narrowing is considered profitable here.
  if (!SrcVT.isScalarInteger() || !DestVT.isScalarInteger())
    return false;
  // i16 instructions are longer (0x66 prefix) and potentially slower, so
  // narrowing i32 down to i16 is not profitable.
  return SrcVT != MVT::i32 || DestVT != MVT::i16;
}

Expand Down
67 changes: 67 additions & 0 deletions llvm/test/CodeGen/AMDGPU/add_i1.ll
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
Expand All @@ -6,6 +7,20 @@
; GFX9: v_xor_b32_e32
; GFX10: v_xor_b32_e32
define amdgpu_kernel void @add_var_var_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
; GFX9-LABEL: add_var_var_i1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_xor_b32_e32 v1, v1, v2
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
%a = load volatile i1, ptr addrspace(1) %in0
%b = load volatile i1, ptr addrspace(1) %in1
%add = add i1 %a, %b
Expand All @@ -17,6 +32,17 @@ define amdgpu_kernel void @add_var_var_i1(ptr addrspace(1) %out, ptr addrspace(1
; GFX9: s_xor_b64
; GFX10: s_xor_b32
define amdgpu_kernel void @add_var_imm_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX9-LABEL: add_var_imm_i1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_not_b32_e32 v1, v1
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
%a = load volatile i1, ptr addrspace(1) %in
%add = add i1 %a, 1
store i1 %add, ptr addrspace(1) %out
Expand All @@ -28,6 +54,44 @@ define amdgpu_kernel void @add_var_imm_i1(ptr addrspace(1) %out, ptr addrspace(1
; GFX9: s_xor_b64
; GFX10: s_xor_b32
define amdgpu_kernel void @add_i1_cf(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
; GFX9-LABEL: add_i1_cf:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
; GFX9-NEXT: s_cbranch_execz .LBB2_2
; GFX9-NEXT: ; %bb.1: ; %else
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
; GFX9-NEXT: .LBB2_2: ; %Flow
; GFX9-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
; GFX9-NEXT: s_cbranch_execz .LBB2_4
; GFX9-NEXT: ; %bb.3: ; %if
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_andn2_b64 s[2:3], s[4:5], exec
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5]
; GFX9-NEXT: .LBB2_4: ; %endif
; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
; GFX9-NEXT: v_not_b32_e32 v1, v1
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%d_cmp = icmp ult i32 %tid, 16
Expand All @@ -49,3 +113,6 @@ endif:
}

declare i32 @llvm.amdgcn.workitem.id.x()
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN: {{.*}}
; GFX10: {{.*}}
72 changes: 42 additions & 30 deletions llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
Original file line number Diff line number Diff line change
Expand Up @@ -565,8 +565,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: s_cbranch_vccz .LBB3_12
; GFX908-NEXT: .LBB3_2: ; %bb9
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB3_5 Depth 2
; GFX908-NEXT: s_mov_b64 s[18:19], -1
; GFX908-NEXT: ; Child Loop BB3_6 Depth 2
; GFX908-NEXT: s_mov_b64 s[22:23], -1
; GFX908-NEXT: s_mov_b64 vcc, s[0:1]
; GFX908-NEXT: s_cbranch_vccz .LBB3_10
; GFX908-NEXT: ; %bb.3: ; %bb14
Expand Down Expand Up @@ -597,18 +597,25 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: s_add_i32 s13, s22, s13
; GFX908-NEXT: s_mul_i32 s9, s6, s9
; GFX908-NEXT: s_add_i32 s13, s13, s23
; GFX908-NEXT: s_branch .LBB3_5
; GFX908-NEXT: s_branch .LBB3_6
; GFX908-NEXT: .LBB3_4: ; %bb58
; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2
; GFX908-NEXT: ; in Loop: Header=BB3_6 Depth=2
; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX908-NEXT: s_add_u32 s20, s20, s4
; GFX908-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[2:3]
; GFX908-NEXT: s_addc_u32 s21, s21, s5
; GFX908-NEXT: s_mov_b64 s[22:23], 0
; GFX908-NEXT: .LBB3_5: ; %Flow18
; GFX908-NEXT: ; in Loop: Header=BB3_6 Depth=2
; GFX908-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[22:23]
; GFX908-NEXT: v_readfirstlane_b32 s22, v12
; GFX908-NEXT: s_not_b32 s22, s22
; GFX908-NEXT: s_bitcmp1_b32 s22, 0
; GFX908-NEXT: s_cselect_b64 s[22:23], -1, 0
; GFX908-NEXT: s_andn2_b64 vcc, exec, s[24:25]
; GFX908-NEXT: s_cbranch_vccz .LBB3_9
; GFX908-NEXT: .LBB3_5: ; %bb16
; GFX908-NEXT: s_cbranch_vccz .LBB3_10
; GFX908-NEXT: .LBB3_6: ; %bb16
; GFX908-NEXT: ; Parent Loop BB3_2 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: s_add_u32 s22, s20, s9
Expand All @@ -625,9 +632,9 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: ds_read_b64 v[14:15], v0
; GFX908-NEXT: s_and_b64 vcc, exec, s[2:3]
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_cbranch_vccnz .LBB3_7
; GFX908-NEXT: ; %bb.6: ; %bb51
; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2
; GFX908-NEXT: s_cbranch_vccnz .LBB3_8
; GFX908-NEXT: ; %bb.7: ; %bb51
; GFX908-NEXT: ; in Loop: Header=BB3_6 Depth=2
; GFX908-NEXT: v_cvt_f32_f16_sdwa v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX908-NEXT: v_cvt_f32_f16_e32 v21, v21
; GFX908-NEXT: v_cvt_f32_f16_sdwa v23, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
Expand All @@ -649,21 +656,20 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: v_add_f32_e32 v10, v10, v12
; GFX908-NEXT: v_add_f32_e32 v11, v11, v13
; GFX908-NEXT: s_branch .LBB3_4
; GFX908-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2
; GFX908-NEXT: .LBB3_8: ; in Loop: Header=BB3_6 Depth=2
; GFX908-NEXT: s_mov_b64 s[22:23], s[18:19]
; GFX908-NEXT: s_andn2_b64 vcc, exec, s[22:23]
; GFX908-NEXT: s_cbranch_vccz .LBB3_4
; GFX908-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
; GFX908-NEXT: ; %bb.9: ; in Loop: Header=BB3_6 Depth=2
; GFX908-NEXT: s_mov_b64 s[22:23], -1
; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX908-NEXT: ; implicit-def: $sgpr20_sgpr21
; GFX908-NEXT: .LBB3_9: ; %loop.exit.guard
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX908-NEXT: s_xor_b64 s[18:19], s[22:23], -1
; GFX908-NEXT: s_mov_b64 s[24:25], -1
; GFX908-NEXT: s_branch .LBB3_5
; GFX908-NEXT: .LBB3_10: ; %Flow19
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX908-NEXT: s_mov_b64 s[2:3], -1
; GFX908-NEXT: s_and_b64 vcc, exec, s[18:19]
; GFX908-NEXT: s_and_b64 vcc, exec, s[22:23]
; GFX908-NEXT: s_cbranch_vccz .LBB3_1
; GFX908-NEXT: ; %bb.11: ; %bb12
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
Expand Down Expand Up @@ -730,8 +736,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: s_cbranch_vccz .LBB3_12
; GFX90A-NEXT: .LBB3_2: ; %bb9
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB3_5 Depth 2
; GFX90A-NEXT: s_mov_b64 s[18:19], -1
; GFX90A-NEXT: ; Child Loop BB3_6 Depth 2
; GFX90A-NEXT: s_mov_b64 s[22:23], -1
; GFX90A-NEXT: s_mov_b64 vcc, s[0:1]
; GFX90A-NEXT: s_cbranch_vccz .LBB3_10
; GFX90A-NEXT: ; %bb.3: ; %bb14
Expand All @@ -758,18 +764,25 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: s_add_i32 s13, s22, s13
; GFX90A-NEXT: s_mul_i32 s9, s6, s9
; GFX90A-NEXT: s_add_i32 s13, s13, s23
; GFX90A-NEXT: s_branch .LBB3_5
; GFX90A-NEXT: s_branch .LBB3_6
; GFX90A-NEXT: .LBB3_4: ; %bb58
; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2
; GFX90A-NEXT: ; in Loop: Header=BB3_6 Depth=2
; GFX90A-NEXT: v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX90A-NEXT: s_add_u32 s20, s20, s4
; GFX90A-NEXT: s_addc_u32 s21, s21, s5
; GFX90A-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[4:5]
; GFX90A-NEXT: s_mov_b64 s[22:23], 0
; GFX90A-NEXT: .LBB3_5: ; %Flow18
; GFX90A-NEXT: ; in Loop: Header=BB3_6 Depth=2
; GFX90A-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[22:23]
; GFX90A-NEXT: v_readfirstlane_b32 s22, v14
; GFX90A-NEXT: s_not_b32 s22, s22
; GFX90A-NEXT: s_bitcmp1_b32 s22, 0
; GFX90A-NEXT: s_cselect_b64 s[22:23], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[24:25]
; GFX90A-NEXT: s_cbranch_vccz .LBB3_9
; GFX90A-NEXT: .LBB3_5: ; %bb16
; GFX90A-NEXT: s_cbranch_vccz .LBB3_10
; GFX90A-NEXT: .LBB3_6: ; %bb16
; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: s_add_u32 s22, s20, s9
Expand All @@ -787,9 +800,9 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: s_and_b64 vcc, exec, s[2:3]
; GFX90A-NEXT: ; kill: killed $sgpr22 killed $sgpr23
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_cbranch_vccnz .LBB3_7
; GFX90A-NEXT: ; %bb.6: ; %bb51
; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2
; GFX90A-NEXT: s_cbranch_vccnz .LBB3_8
; GFX90A-NEXT: ; %bb.7: ; %bb51
; GFX90A-NEXT: ; in Loop: Header=BB3_6 Depth=2
; GFX90A-NEXT: v_cvt_f32_f16_sdwa v23, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX90A-NEXT: v_cvt_f32_f16_e32 v22, v21
; GFX90A-NEXT: v_cvt_f32_f16_sdwa v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
Expand All @@ -803,21 +816,20 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[16:17]
; GFX90A-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[14:15]
; GFX90A-NEXT: s_branch .LBB3_4
; GFX90A-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2
; GFX90A-NEXT: .LBB3_8: ; in Loop: Header=BB3_6 Depth=2
; GFX90A-NEXT: s_mov_b64 s[22:23], s[18:19]
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[22:23]
; GFX90A-NEXT: s_cbranch_vccz .LBB3_4
; GFX90A-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
; GFX90A-NEXT: ; %bb.9: ; in Loop: Header=BB3_6 Depth=2
; GFX90A-NEXT: s_mov_b64 s[22:23], -1
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $sgpr20_sgpr21
; GFX90A-NEXT: .LBB3_9: ; %loop.exit.guard
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX90A-NEXT: s_xor_b64 s[18:19], s[22:23], -1
; GFX90A-NEXT: s_mov_b64 s[24:25], -1
; GFX90A-NEXT: s_branch .LBB3_5
; GFX90A-NEXT: .LBB3_10: ; %Flow19
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX90A-NEXT: s_mov_b64 s[2:3], -1
; GFX90A-NEXT: s_and_b64 vcc, exec, s[18:19]
; GFX90A-NEXT: s_and_b64 vcc, exec, s[22:23]
; GFX90A-NEXT: s_cbranch_vccz .LBB3_1
; GFX90A-NEXT: ; %bb.11: ; %bb12
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
Expand Down
Loading
Loading