diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d3df43473013e..f369136f99ca8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -13529,6 +13529,77 @@ SDValue DAGCombiner::visitSETCC(SDNode *N) {
   SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
   SDLoc DL(N);
 
+  // Fold setcc(x, C) to a constant when the known bits of x decide the result,
+  // e.g. (x & 135) == 1024 can never hold.
+  if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1)) {
+    // Built from a constant, so every bit of KnownRHS is known.
+    KnownBits KnownRHS = KnownBits::makeConstant(N1C->getAPIntValue());
+
+    std::optional<bool> KnownVal;
+
+    // Handle special cases first (like GlobalISel does); they are decided
+    // without computing the known bits of the LHS.
+    if (KnownRHS.isZero()) {
+      // x >=u 0 -> always true
+      // x <u 0 -> always false
+      if (Cond == ISD::SETUGE)
+        KnownVal = true;
+      else if (Cond == ISD::SETULT)
+        KnownVal = false;
+    }
+
+    if (!KnownVal) {
+      bool SupportedPredicate = true;
+      KnownBits KnownLHS = DAG.computeKnownBits(N0);
+
+      // Convert ISD::CondCode to CmpInst::Predicate. Condition codes that are
+      // not listed here leave the setcc to the combines below.
+      CmpInst::Predicate Pred;
+      switch (Cond) {
+      case ISD::SETEQ:
+        Pred = CmpInst::ICMP_EQ;
+        break;
+      case ISD::SETNE:
+        Pred = CmpInst::ICMP_NE;
+        break;
+      case ISD::SETULT:
+        Pred = CmpInst::ICMP_ULT;
+        break;
+      case ISD::SETULE:
+        Pred = CmpInst::ICMP_ULE;
+        break;
+      case ISD::SETUGT:
+        Pred = CmpInst::ICMP_UGT;
+        break;
+      case ISD::SETUGE:
+        Pred = CmpInst::ICMP_UGE;
+        break;
+      case ISD::SETLT:
+        Pred = CmpInst::ICMP_SLT;
+        break;
+      case ISD::SETLE:
+        Pred = CmpInst::ICMP_SLE;
+        break;
+      case ISD::SETGT:
+        Pred = CmpInst::ICMP_SGT;
+        break;
+      case ISD::SETGE:
+        Pred = CmpInst::ICMP_SGE;
+        break;
+      default:
+        SupportedPredicate = false;
+        break;
+      }
+
+      if (SupportedPredicate)
+        KnownVal = ICmpInst::compare(KnownLHS, KnownRHS, Pred);
+    }
+
+    // If the comparison result is known, replace with constant
+    if (KnownVal)
+      return DAG.getBoolConstant(*KnownVal, DL, VT, N1.getValueType());
+  }
+
   if (SDValue Combined = SimplifySetCC(VT, N0, N1, Cond, DL, !PreferSetCC)) {
     // If we prefer to have a setcc, and we don't, we'll try our best to
     // recreate one using rebuildSetCC.
diff --git a/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll b/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
index 113eb14ca4803..a2d178166bb6e 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
@@ -20,10 +20,7 @@ entry:
 define i8 @test2(i32 %a) {
 ; CHECK-LABEL: test2:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, #135 // =0x87
-; CHECK-NEXT: and w8, w0, w8
-; CHECK-NEXT: cmp w8, #1024
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: mov w0, wzr
 ; CHECK-NEXT: ret
 entry:
   %and = and i32 %a, 135
@@ -68,10 +65,7 @@ entry:
 define i8 @test5(i64 %a) {
 ; CHECK-LABEL: test5:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: and x8, x0, #0x3ffffc000
-; CHECK-NEXT: and x8, x8, #0xfffffffe00007fff
-; CHECK-NEXT: cmp x8, #1024
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: mov w0, wzr
 ; CHECK-NEXT: ret
 entry:
   %and = and i64 %a, 8589950976
@@ -84,10 +78,7 @@ entry:
 define i8 @test6(i64 %a) {
 ; CHECK-LABEL: test6:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, #135 // =0x87
-; CHECK-NEXT: and x8, x0, x8
-; CHECK-NEXT: cmp x8, #1024
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: mov w0, wzr
 ; CHECK-NEXT: ret
 entry:
   %and = and i64 %a, 135
@@ -252,10 +243,7 @@ entry:
 define i8 @test11(i64 %a) {
 ; CHECK-LABEL: test11:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, #-1610612736 // =0xa0000000
-; CHECK-NEXT: and x8, x0, x8
-; CHECK-NEXT: cmp x8, #1024
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: mov w0, wzr
 ; CHECK-NEXT: ret
 entry:
   %and = and i64 %a, 2684354560
diff --git a/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-shl-in-eqcmp-zero.ll b/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
index 4a73b10811d29..19a2babaffcb2
100644 --- a/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-shl-in-eqcmp-zero.ll +++ b/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-shl-in-eqcmp-zero.ll @@ -326,12 +326,7 @@ define i1 @scalar_i8_bitsinmiddle_slt(i8 %x, i8 %y) nounwind { define i1 @scalar_i8_signbit_eq_with_nonzero(i8 %x, i8 %y) nounwind { ; CHECK-LABEL: scalar_i8_signbit_eq_with_nonzero: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #-128 // =0xffffff80 -; CHECK-NEXT: lsl w8, w8, w1 -; CHECK-NEXT: and w8, w8, w0 -; CHECK-NEXT: and w8, w8, #0x80 -; CHECK-NEXT: cmp w8, #1 -; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret %t0 = shl i8 128, %y %t1 = and i8 %t0, %x diff --git a/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll b/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll index 33c5ba7987974..8240ce073a6de 100644 --- a/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll +++ b/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll @@ -95,9 +95,7 @@ define i1 @lt64_u64(i64 %0) { define i1 @lt8_u16_and_5(i8 %0) { ; CHECK-LABEL: lt8_u16_and_5: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret %2 = and i8 %0, 5 %3 = icmp ult i8 %2, 16 @@ -118,9 +116,7 @@ define i1 @lt8_u16_and_19(i8 %0) { define i1 @lt32_u16_and_7(i32 %0) { ; CHECK-LABEL: lt32_u16_and_7: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret %2 = and i32 %0, 7 %3 = icmp ult i32 %2, 16 @@ -141,9 +137,7 @@ define i1 @lt32_u16_and_21(i32 %0) { define i1 @lt64_u16_and_9(i64 %0) { ; CHECK-LABEL: lt64_u16_and_9: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: cmp x8, #0 -; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret %2 = and i64 %0, 9 %3 = icmp ult i64 %2, 16 diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll index 
15cda622b902d..a8e0397594e5d 100644 --- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -21,6 +21,46 @@ ; GCN: ds_write_b32 v0, v0 define void @func_mov_fi_i32() #0 { +; CI-LABEL: func_mov_fi_i32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshr_b32_e64 v0, s32, 6 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_write_b32 v0, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-MUBUF-LABEL: func_mov_fi_i32: +; GFX9-MUBUF: ; %bb.0: +; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-MUBUF-NEXT: v_lshrrev_b32_e64 v0, 6, s32 +; GFX9-MUBUF-NEXT: ds_write_b32 v0, v0 +; GFX9-MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLATSCR-LABEL: func_mov_fi_i32: +; GFX9-FLATSCR: ; %bb.0: +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s32 +; GFX9-FLATSCR-NEXT: ds_write_b32 v0, v0 +; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: func_mov_fi_i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s32 +; GFX11-TRUE16-NEXT: ds_store_b32 v0, v0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: func_mov_fi_i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s32 +; GFX11-FAKE16-NEXT: ds_store_b32 v0, v0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) store volatile ptr addrspace(5) %alloca, ptr addrspace(3) poison ret void @@ -46,6 +86,61 @@ define void @func_mov_fi_i32() #0 { ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, [[ADD]] ; GFX9-NEXT: ds_write_b32 v0, v0 define void @func_mov_fi_i32_offset() #0 { +; 
CI-LABEL: func_mov_fi_i32_offset: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshr_b32_e64 v0, s32, 6 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_write_b32 v0, v0 +; CI-NEXT: v_lshr_b32_e64 v0, s32, 6 +; CI-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; CI-NEXT: ds_write_b32 v0, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-MUBUF-LABEL: func_mov_fi_i32_offset: +; GFX9-MUBUF: ; %bb.0: +; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-MUBUF-NEXT: v_lshrrev_b32_e64 v0, 6, s32 +; GFX9-MUBUF-NEXT: ds_write_b32 v0, v0 +; GFX9-MUBUF-NEXT: v_lshrrev_b32_e64 v0, 6, s32 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX9-MUBUF-NEXT: ds_write_b32 v0, v0 +; GFX9-MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLATSCR-LABEL: func_mov_fi_i32_offset: +; GFX9-FLATSCR: ; %bb.0: +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s32 +; GFX9-FLATSCR-NEXT: s_add_i32 s0, s32, 4 +; GFX9-FLATSCR-NEXT: ds_write_b32 v0, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-FLATSCR-NEXT: ds_write_b32 v0, v0 +; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: func_mov_fi_i32_offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_add_i32 s0, s32, 4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s32 :: v_dual_mov_b32 v1, s0 +; GFX11-TRUE16-NEXT: ds_store_b32 v0, v0 +; GFX11-TRUE16-NEXT: ds_store_b32 v0, v1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: func_mov_fi_i32_offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_add_i32 s0, s32, 4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: 
v_dual_mov_b32 v0, s32 :: v_dual_mov_b32 v1, s0 +; GFX11-FAKE16-NEXT: ds_store_b32 v0, v0 +; GFX11-FAKE16-NEXT: ds_store_b32 v0, v1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca i32, addrspace(5) %alloca1 = alloca i32, addrspace(5) store volatile ptr addrspace(5) %alloca0, ptr addrspace(3) poison @@ -71,6 +166,48 @@ define void @func_mov_fi_i32_offset() #0 { ; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 define void @func_add_constant_to_fi_i32() #0 { +; CI-LABEL: func_add_constant_to_fi_i32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshr_b32_e64 v1, s32, 6 +; CI-NEXT: v_add_i32_e32 v0, vcc, 4, v1 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_write_b32 v0, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-MUBUF-LABEL: func_add_constant_to_fi_i32: +; GFX9-MUBUF: ; %bb.0: +; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-MUBUF-NEXT: v_lshrrev_b32_e64 v1, 6, s32 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v0, 4, v1 +; GFX9-MUBUF-NEXT: ds_write_b32 v0, v0 +; GFX9-MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLATSCR-LABEL: func_add_constant_to_fi_i32: +; GFX9-FLATSCR: ; %bb.0: +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT: v_add_u32_e64 v0, 4, s32 +; GFX9-FLATSCR-NEXT: ds_write_b32 v0, v0 +; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: func_add_constant_to_fi_i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e64 v0, 4, s32 +; GFX11-TRUE16-NEXT: ds_store_b32 v0, v0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: func_add_constant_to_fi_i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e64 
v0, 4, s32 +; GFX11-FAKE16-NEXT: ds_store_b32 v0, v0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %alloca = alloca [2 x i32], align 4, addrspace(5) %gep0 = getelementptr inbounds [2 x i32], ptr addrspace(5) %alloca, i32 0, i32 1 store volatile ptr addrspace(5) %gep0, ptr addrspace(3) poison @@ -93,6 +230,55 @@ define void @func_add_constant_to_fi_i32() #0 { ; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 define void @func_other_fi_user_i32() #0 { +; CI-LABEL: func_other_fi_user_i32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_lshr_b32 s5, s32, 6 +; CI-NEXT: s_mul_i32 s4, s5, 9 +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_write_b32 v0, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-MUBUF-LABEL: func_other_fi_user_i32: +; GFX9-MUBUF: ; %bb.0: +; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-MUBUF-NEXT: s_lshr_b32 s5, s32, 6 +; GFX9-MUBUF-NEXT: s_mul_i32 s4, s5, 9 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-MUBUF-NEXT: ds_write_b32 v0, v0 +; GFX9-MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLATSCR-LABEL: func_other_fi_user_i32: +; GFX9-FLATSCR: ; %bb.0: +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT: s_mul_i32 s0, s32, 9 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-FLATSCR-NEXT: ds_write_b32 v0, v0 +; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: func_other_fi_user_i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mul_i32 s0, s32, 9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-TRUE16-NEXT: ds_store_b32 v0, v0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: 
func_other_fi_user_i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mul_i32 s0, s32, 9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-FAKE16-NEXT: ds_store_b32 v0, v0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %alloca = alloca [2 x i32], align 4, addrspace(5) %ptrtoint = ptrtoint ptr addrspace(5) %alloca to i32 %mul = mul i32 %ptrtoint, 9 @@ -105,6 +291,45 @@ define void @func_other_fi_user_i32() #0 { ; MUBUF: buffer_store_dword v1, v0, s[0:3], 0 offen{{$}} ; GFX9-FLATSCR: scratch_store_dword v0, v1, off{{$}} define void @func_store_private_arg_i32_ptr(ptr addrspace(5) %ptr) #0 { +; CI-LABEL: func_store_private_arg_i32_ptr: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, 15 +; CI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-MUBUF-LABEL: func_store_private_arg_i32_ptr: +; GFX9-MUBUF: ; %bb.0: +; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v1, 15 +; GFX9-MUBUF-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX9-MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLATSCR-LABEL: func_store_private_arg_i32_ptr: +; GFX9-FLATSCR: ; %bb.0: +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 15 +; GFX9-FLATSCR-NEXT: scratch_store_dword v0, v1, off +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: func_store_private_arg_i32_ptr: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 15 +; GFX11-TRUE16-NEXT: scratch_store_b32 v0, v1, off dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: func_store_private_arg_i32_ptr: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 15 +; GFX11-FAKE16-NEXT: scratch_store_b32 v0, v1, off dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] store volatile i32 15, ptr addrspace(5) %ptr ret void } @@ -114,6 +339,40 @@ define void @func_store_private_arg_i32_ptr(ptr addrspace(5) %ptr) #0 { ; MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc{{$}} ; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v0, off glc{{$}} define void @func_load_private_arg_i32_ptr(ptr addrspace(5) %ptr) #0 { +; CI-LABEL: func_load_private_arg_i32_ptr: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-MUBUF-LABEL: func_load_private_arg_i32_ptr: +; GFX9-MUBUF: ; %bb.0: +; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc +; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX9-MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLATSCR-LABEL: func_load_private_arg_i32_ptr: +; GFX9-FLATSCR: ; %bb.0: +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v0, off glc +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: func_load_private_arg_i32_ptr: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: scratch_load_b32 v0, v0, off glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: func_load_private_arg_i32_ptr: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: scratch_load_b32 v0, v0, off glc dlc 
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %val = load volatile i32, ptr addrspace(5) %ptr ret void } @@ -132,6 +391,48 @@ define void @func_load_private_arg_i32_ptr(ptr addrspace(5) %ptr) #0 { ; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 define void @void_func_byval_struct_i8_i32_ptr(ptr addrspace(5) byval({ i8, i32 }) %arg0) #0 { +; CI-LABEL: void_func_byval_struct_i8_i32_ptr: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshr_b32_e64 v1, s32, 6 +; CI-NEXT: v_or_b32_e32 v0, 4, v1 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_write_b32 v0, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-MUBUF-LABEL: void_func_byval_struct_i8_i32_ptr: +; GFX9-MUBUF: ; %bb.0: +; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-MUBUF-NEXT: v_lshrrev_b32_e64 v1, 6, s32 +; GFX9-MUBUF-NEXT: v_or_b32_e32 v0, 4, v1 +; GFX9-MUBUF-NEXT: ds_write_b32 v0, v0 +; GFX9-MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLATSCR-LABEL: void_func_byval_struct_i8_i32_ptr: +; GFX9-FLATSCR: ; %bb.0: +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT: v_or_b32_e64 v0, s32, 4 +; GFX9-FLATSCR-NEXT: ds_write_b32 v0, v0 +; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: void_func_byval_struct_i8_i32_ptr: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_or_b32_e64 v0, s32, 4 +; GFX11-TRUE16-NEXT: ds_store_b32 v0, v0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: void_func_byval_struct_i8_i32_ptr: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_or_b32_e64 v0, s32, 4 +; GFX11-FAKE16-NEXT: ds_store_b32 v0, v0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: 
s_setpc_b64 s[30:31] %gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %arg0, i32 0, i32 0 %gep1 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %arg0, i32 0, i32 1 %load1 = load i32, ptr addrspace(5) %gep1 @@ -146,6 +447,68 @@ define void @void_func_byval_struct_i8_i32_ptr(ptr addrspace(5) byval({ i8, i32 ; GFX9-FLATSCR-NEXT: scratch_load_ubyte v0, off, s32 ; GFX9-FLATSCR-NEXT: scratch_load_dword v1, off, s32 offset:4 define void @void_func_byval_struct_i8_i32_ptr_value(ptr addrspace(5) byval({ i8, i32 }) %arg0) #0 { +; CI-LABEL: void_func_byval_struct_i8_i32_ptr_value: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_load_ubyte v0, off, s[0:3], s32 +; CI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: ds_write_b8 v0, v0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: ds_write_b32 v0, v1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-MUBUF-LABEL: void_func_byval_struct_i8_i32_ptr_value: +; GFX9-MUBUF: ; %bb.0: +; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-MUBUF-NEXT: buffer_load_ubyte v0, off, s[0:3], s32 +; GFX9-MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(1) +; GFX9-MUBUF-NEXT: ds_write_b8 v0, v0 +; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX9-MUBUF-NEXT: ds_write_b32 v0, v1 +; GFX9-MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLATSCR-LABEL: void_func_byval_struct_i8_i32_ptr_value: +; GFX9-FLATSCR: ; %bb.0: +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT: scratch_load_ubyte v0, off, s32 +; GFX9-FLATSCR-NEXT: scratch_load_dword v1, off, s32 offset:4 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) +; GFX9-FLATSCR-NEXT: ds_write_b8 v0, v0 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: ds_write_b32 v0, v1 +; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: void_func_byval_struct_i8_i32_ptr_value: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v0, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: ds_store_b8 v0, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: ds_store_b32 v0, v1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: void_func_byval_struct_i8_i32_ptr_value: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_load_u8 v0, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: ds_store_b8 v0, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: ds_store_b32 v0, v1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %arg0, i32 0, i32 0 %gep1 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %arg0, i32 0, i32 1 %load0 = load i8, ptr addrspace(5) %gep0 @@ -173,6 +536,88 @@ define void @void_func_byval_struct_i8_i32_ptr_value(ptr addrspace(5) byval({ i8 ; GCN: ds_write_b32 v{{[0-9]+}}, [[GEP]] define void @void_func_byval_struct_i8_i32_ptr_nonentry_block(ptr addrspace(5) byval({ i8, i32 }) %arg0, i32 %arg2) #0 { +; CI-LABEL: void_func_byval_struct_i8_i32_ptr_nonentry_block: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CI-NEXT: s_cbranch_execz .LBB8_2 +; CI-NEXT: ; %bb.1: ; %bb +; CI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; 
CI-NEXT: v_lshr_b32_e64 v1, s32, 6 +; CI-NEXT: v_add_i32_e64 v0, s[6:7], 4, v1 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_write_b32 v0, v0 +; CI-NEXT: .LBB8_2: ; %ret +; CI-NEXT: s_or_b64 exec, exec, s[4:5] +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-MUBUF-LABEL: void_func_byval_struct_i8_i32_ptr_nonentry_block: +; GFX9-MUBUF: ; %bb.0: +; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-MUBUF-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-MUBUF-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-MUBUF-NEXT: ; %bb.1: ; %bb +; GFX9-MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc +; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX9-MUBUF-NEXT: v_lshrrev_b32_e64 v1, 6, s32 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v0, 4, v1 +; GFX9-MUBUF-NEXT: ds_write_b32 v0, v0 +; GFX9-MUBUF-NEXT: .LBB8_2: ; %ret +; GFX9-MUBUF-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLATSCR-LABEL: void_func_byval_struct_i8_i32_ptr_nonentry_block: +; GFX9-FLATSCR: ; %bb.0: +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-FLATSCR-NEXT: ; %bb.1: ; %bb +; GFX9-FLATSCR-NEXT: scratch_load_dword v0, off, s32 offset:4 glc +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: v_add_u32_e64 v0, 4, s32 +; GFX9-FLATSCR-NEXT: ds_write_b32 v0, v0 +; GFX9-FLATSCR-NEXT: .LBB8_2: ; %ret +; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: void_func_byval_struct_i8_i32_ptr_nonentry_block: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v0 +; 
GFX11-TRUE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %bb +; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e64 v0, 4, s32 +; GFX11-TRUE16-NEXT: ds_store_b32 v0, v0 +; GFX11-TRUE16-NEXT: .LBB8_2: ; %ret +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: void_func_byval_struct_i8_i32_ptr_nonentry_block: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %bb +; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u32_e64 v0, 4, s32 +; GFX11-FAKE16-NEXT: ds_store_b32 v0, v0 +; GFX11-FAKE16-NEXT: .LBB8_2: ; %ret +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %arg2, 0 br i1 %cmp, label %bb, label %ret @@ -202,6 +647,73 @@ ret: ; GCN: ds_write_b32 v0, [[VZ]] define void @func_other_fi_user_non_inline_imm_offset_i32() #0 { +; CI-LABEL: func_other_fi_user_non_inline_imm_offset_i32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_lshr_b32 s5, s32, 6 +; CI-NEXT: s_addk_i32 s5, 0x200 +; CI-NEXT: v_mov_b32_e32 v0, 7 +; CI-NEXT: s_mul_i32 s4, s5, 9 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_write_b32 v0, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-MUBUF-LABEL: func_other_fi_user_non_inline_imm_offset_i32: +; GFX9-MUBUF: ; %bb.0: +; GFX9-MUBUF-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-MUBUF-NEXT: s_lshr_b32 s5, s32, 6 +; GFX9-MUBUF-NEXT: s_addk_i32 s5, 0x200 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v0, 7 +; GFX9-MUBUF-NEXT: s_mul_i32 s4, s5, 9 +; GFX9-MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 +; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-MUBUF-NEXT: ds_write_b32 v0, v0 +; GFX9-MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLATSCR-LABEL: func_other_fi_user_non_inline_imm_offset_i32: +; GFX9-FLATSCR: ; %bb.0: +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT: s_add_i32 s1, s32, 0x200 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 7 +; GFX9-FLATSCR-NEXT: s_mul_i32 s0, s1, 9 +; GFX9-FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:260 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-FLATSCR-NEXT: ds_write_b32 v0, v0 +; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: func_other_fi_user_non_inline_imm_offset_i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_add_i32 s1, s32, 0x200 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_mul_i32 s0, s1, 9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, 7 :: v_dual_mov_b32 v1, s0 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:260 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ds_store_b32 v0, v1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: func_other_fi_user_non_inline_imm_offset_i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_add_i32 s1, s32, 0x200 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; 
GFX11-FAKE16-NEXT: s_mul_i32 s0, s1, 9 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 7 :: v_dual_mov_b32 v1, s0 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:260 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ds_store_b32 v0, v1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca [128 x i32], align 4, addrspace(5) %alloca1 = alloca [8 x i32], align 4, addrspace(5) %gep0 = getelementptr inbounds [128 x i32], ptr addrspace(5) %alloca0, i32 0, i32 65 @@ -225,6 +737,103 @@ define void @func_other_fi_user_non_inline_imm_offset_i32() #0 { ; GCN: ds_write_b32 v0, [[VZ]] define void @func_other_fi_user_non_inline_imm_offset_i32_vcc_live() #0 { +; CI-LABEL: func_other_fi_user_non_inline_imm_offset_i32_vcc_live: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_lshr_b32 s5, s32, 6 +; CI-NEXT: s_addk_i32 s5, 0x200 +; CI-NEXT: v_mov_b32_e32 v0, 7 +; CI-NEXT: s_mul_i32 s4, s5, 9 +; CI-NEXT: ;;#ASMSTART +; CI-NEXT: ; def vcc +; CI-NEXT: ;;#ASMEND +; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ;;#ASMSTART +; CI-NEXT: ; use vcc +; CI-NEXT: ;;#ASMEND +; CI-NEXT: ds_write_b32 v0, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-MUBUF-LABEL: func_other_fi_user_non_inline_imm_offset_i32_vcc_live: +; GFX9-MUBUF: ; %bb.0: +; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-MUBUF-NEXT: s_lshr_b32 s5, s32, 6 +; GFX9-MUBUF-NEXT: s_addk_i32 s5, 0x200 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v0, 7 +; GFX9-MUBUF-NEXT: s_mul_i32 s4, s5, 9 +; GFX9-MUBUF-NEXT: ;;#ASMSTART +; GFX9-MUBUF-NEXT: ; def vcc +; GFX9-MUBUF-NEXT: ;;#ASMEND +; GFX9-MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 +; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-MUBUF-NEXT: ;;#ASMSTART +; 
GFX9-MUBUF-NEXT: ; use vcc +; GFX9-MUBUF-NEXT: ;;#ASMEND +; GFX9-MUBUF-NEXT: ds_write_b32 v0, v0 +; GFX9-MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLATSCR-LABEL: func_other_fi_user_non_inline_imm_offset_i32_vcc_live: +; GFX9-FLATSCR: ; %bb.0: +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT: s_add_i32 s1, s32, 0x200 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 7 +; GFX9-FLATSCR-NEXT: s_mul_i32 s0, s1, 9 +; GFX9-FLATSCR-NEXT: ;;#ASMSTART +; GFX9-FLATSCR-NEXT: ; def vcc +; GFX9-FLATSCR-NEXT: ;;#ASMEND +; GFX9-FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:260 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-FLATSCR-NEXT: ;;#ASMSTART +; GFX9-FLATSCR-NEXT: ; use vcc +; GFX9-FLATSCR-NEXT: ;;#ASMEND +; GFX9-FLATSCR-NEXT: ds_write_b32 v0, v0 +; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: func_other_fi_user_non_inline_imm_offset_i32_vcc_live: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_add_i32 s1, s32, 0x200 +; GFX11-TRUE16-NEXT: ;;#ASMSTART +; GFX11-TRUE16-NEXT: ; def vcc +; GFX11-TRUE16-NEXT: ;;#ASMEND +; GFX11-TRUE16-NEXT: s_mul_i32 s0, s1, 9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, 7 :: v_dual_mov_b32 v1, s0 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:260 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: ;;#ASMSTART +; GFX11-TRUE16-NEXT: ; use vcc +; GFX11-TRUE16-NEXT: ;;#ASMEND +; GFX11-TRUE16-NEXT: ds_store_b32 v0, v1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: func_other_fi_user_non_inline_imm_offset_i32_vcc_live: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_add_i32 s1, s32, 0x200 +; 
GFX11-FAKE16-NEXT: ;;#ASMSTART +; GFX11-FAKE16-NEXT: ; def vcc +; GFX11-FAKE16-NEXT: ;;#ASMEND +; GFX11-FAKE16-NEXT: s_mul_i32 s0, s1, 9 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 7 :: v_dual_mov_b32 v1, s0 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:260 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: ;;#ASMSTART +; GFX11-FAKE16-NEXT: ; use vcc +; GFX11-FAKE16-NEXT: ;;#ASMEND +; GFX11-FAKE16-NEXT: ds_store_b32 v0, v1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca [128 x i32], align 4, addrspace(5) %alloca1 = alloca [8 x i32], align 4, addrspace(5) %vcc = call i64 asm sideeffect "; def $0", "={vcc}"() @@ -253,6 +862,486 @@ declare void @func(ptr addrspace(5) nocapture) #0 ; FLATSCR: scratch_store_dword v0, off, s33 offset: ; FLATSCR: scratch_store_dword v{{[0-9]+}}, off, s33 offset: define void @undefined_stack_store_reg(float %arg, i32 %arg1) #0 { +; CI-LABEL: undefined_stack_store_reg: +; CI: ; %bb.0: ; %bb +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 s16, s33 +; CI-NEXT: s_mov_b32 s33, s32 +; CI-NEXT: s_or_saveexec_b64 s[18:19], -1 +; CI-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; CI-NEXT: s_mov_b64 exec, s[18:19] +; CI-NEXT: v_writelane_b32 v42, s16, 18 +; CI-NEXT: v_writelane_b32 v42, s30, 0 +; CI-NEXT: v_writelane_b32 v42, s31, 1 +; CI-NEXT: v_writelane_b32 v42, s34, 2 +; CI-NEXT: v_writelane_b32 v42, s35, 3 +; CI-NEXT: v_writelane_b32 v42, s36, 4 +; CI-NEXT: v_writelane_b32 v42, s37, 5 +; CI-NEXT: v_writelane_b32 v42, s38, 6 +; CI-NEXT: v_writelane_b32 v42, s39, 7 +; CI-NEXT: v_writelane_b32 v42, s48, 8 +; CI-NEXT: v_writelane_b32 v42, s49, 9 +; CI-NEXT: v_writelane_b32 v42, s50, 10 +; CI-NEXT: v_writelane_b32 v42, s51, 11 +; CI-NEXT: v_writelane_b32 v42, s52, 12 +; CI-NEXT: v_writelane_b32 v42, s53, 13 +; CI-NEXT: v_writelane_b32 v42, s54, 14 
+; CI-NEXT: v_writelane_b32 v42, s55, 15 +; CI-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CI-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; CI-NEXT: v_writelane_b32 v42, s64, 16 +; CI-NEXT: v_mov_b32_e32 v40, v0 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: s_addk_i32 s32, 0xc00 +; CI-NEXT: v_writelane_b32 v42, s65, 17 +; CI-NEXT: buffer_store_dword v0, v0, s[0:3], 0 offen +; CI-NEXT: s_and_saveexec_b64 s[54:55], vcc +; CI-NEXT: s_cbranch_execz .LBB11_2 +; CI-NEXT: ; %bb.1: ; %bb4 +; CI-NEXT: s_getpc_b64 s[16:17] +; CI-NEXT: s_add_u32 s16, s16, func@gotpcrel32@lo+4 +; CI-NEXT: s_addc_u32 s17, s17, func@gotpcrel32@hi+12 +; CI-NEXT: s_load_dwordx2 s[64:65], s[16:17], 0x0 +; CI-NEXT: s_mov_b64 s[34:35], s[4:5] +; CI-NEXT: s_mov_b64 s[36:37], s[6:7] +; CI-NEXT: s_mov_b64 s[38:39], s[8:9] +; CI-NEXT: s_mov_b64 s[48:49], s[10:11] +; CI-NEXT: s_mov_b32 s50, s12 +; CI-NEXT: s_mov_b32 s51, s13 +; CI-NEXT: s_mov_b32 s52, s14 +; CI-NEXT: s_mov_b32 s53, s15 +; CI-NEXT: v_mov_b32_e32 v41, v31 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_swappc_b64 s[30:31], s[64:65] +; CI-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:28 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:24 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:20 +; CI-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 +; CI-NEXT: v_lshr_b32_e64 v0, s33, 6 +; CI-NEXT: s_mov_b64 s[4:5], s[34:35] +; CI-NEXT: s_mov_b64 s[6:7], s[36:37] +; CI-NEXT: s_mov_b64 s[8:9], s[38:39] +; CI-NEXT: s_mov_b64 s[10:11], s[48:49] +; CI-NEXT: s_mov_b32 s12, s50 +; CI-NEXT: s_mov_b32 s13, s51 +; CI-NEXT: s_mov_b32 s14, s52 +; CI-NEXT: s_mov_b32 s15, s53 +; CI-NEXT: v_mov_b32_e32 v31, v41 +; CI-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; CI-NEXT: s_swappc_b64 s[30:31], s[64:65] +; CI-NEXT: .LBB11_2: ; %bb5 +; CI-NEXT: s_or_b64 exec, exec, s[54:55] +; CI-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; CI-NEXT: 
buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CI-NEXT: v_readlane_b32 s65, v42, 17 +; CI-NEXT: v_readlane_b32 s64, v42, 16 +; CI-NEXT: v_readlane_b32 s55, v42, 15 +; CI-NEXT: v_readlane_b32 s54, v42, 14 +; CI-NEXT: v_readlane_b32 s53, v42, 13 +; CI-NEXT: v_readlane_b32 s52, v42, 12 +; CI-NEXT: v_readlane_b32 s51, v42, 11 +; CI-NEXT: v_readlane_b32 s50, v42, 10 +; CI-NEXT: v_readlane_b32 s49, v42, 9 +; CI-NEXT: v_readlane_b32 s48, v42, 8 +; CI-NEXT: v_readlane_b32 s39, v42, 7 +; CI-NEXT: v_readlane_b32 s38, v42, 6 +; CI-NEXT: v_readlane_b32 s37, v42, 5 +; CI-NEXT: v_readlane_b32 s36, v42, 4 +; CI-NEXT: v_readlane_b32 s35, v42, 3 +; CI-NEXT: v_readlane_b32 s34, v42, 2 +; CI-NEXT: v_readlane_b32 s31, v42, 1 +; CI-NEXT: v_readlane_b32 s30, v42, 0 +; CI-NEXT: s_mov_b32 s32, s33 +; CI-NEXT: v_readlane_b32 s4, v42, 18 +; CI-NEXT: s_or_saveexec_b64 s[6:7], -1 +; CI-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; CI-NEXT: s_mov_b64 exec, s[6:7] +; CI-NEXT: s_mov_b32 s33, s4 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-MUBUF-LABEL: undefined_stack_store_reg: +; GFX9-MUBUF: ; %bb.0: ; %bb +; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-MUBUF-NEXT: s_mov_b32 s16, s33 +; GFX9-MUBUF-NEXT: s_mov_b32 s33, s32 +; GFX9-MUBUF-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-MUBUF-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GFX9-MUBUF-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-MUBUF-NEXT: v_writelane_b32 v42, s16, 18 +; GFX9-MUBUF-NEXT: v_writelane_b32 v42, s30, 0 +; GFX9-MUBUF-NEXT: v_writelane_b32 v42, s31, 1 +; GFX9-MUBUF-NEXT: v_writelane_b32 v42, s34, 2 +; GFX9-MUBUF-NEXT: v_writelane_b32 v42, s35, 3 +; GFX9-MUBUF-NEXT: v_writelane_b32 v42, s36, 4 +; GFX9-MUBUF-NEXT: v_writelane_b32 v42, s37, 5 +; GFX9-MUBUF-NEXT: v_writelane_b32 v42, s38, 6 +; GFX9-MUBUF-NEXT: v_writelane_b32 v42, s39, 7 +; GFX9-MUBUF-NEXT: v_writelane_b32 v42, s48, 8 +; 
GFX9-MUBUF-NEXT: v_writelane_b32 v42, s49, 9 +; GFX9-MUBUF-NEXT: v_writelane_b32 v42, s50, 10 +; GFX9-MUBUF-NEXT: v_writelane_b32 v42, s51, 11 +; GFX9-MUBUF-NEXT: v_writelane_b32 v42, s52, 12 +; GFX9-MUBUF-NEXT: v_writelane_b32 v42, s53, 13 +; GFX9-MUBUF-NEXT: v_writelane_b32 v42, s54, 14 +; GFX9-MUBUF-NEXT: v_writelane_b32 v42, s55, 15 +; GFX9-MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-MUBUF-NEXT: v_writelane_b32 v42, s64, 16 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-MUBUF-NEXT: s_addk_i32 s32, 0xc00 +; GFX9-MUBUF-NEXT: v_writelane_b32 v42, s65, 17 +; GFX9-MUBUF-NEXT: buffer_store_dword v0, v0, s[0:3], 0 offen +; GFX9-MUBUF-NEXT: s_and_saveexec_b64 s[54:55], vcc +; GFX9-MUBUF-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-MUBUF-NEXT: ; %bb.1: ; %bb4 +; GFX9-MUBUF-NEXT: s_getpc_b64 s[16:17] +; GFX9-MUBUF-NEXT: s_add_u32 s16, s16, func@gotpcrel32@lo+4 +; GFX9-MUBUF-NEXT: s_addc_u32 s17, s17, func@gotpcrel32@hi+12 +; GFX9-MUBUF-NEXT: s_load_dwordx2 s[64:65], s[16:17], 0x0 +; GFX9-MUBUF-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-MUBUF-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX9-MUBUF-NEXT: s_mov_b64 s[38:39], s[8:9] +; GFX9-MUBUF-NEXT: s_mov_b64 s[48:49], s[10:11] +; GFX9-MUBUF-NEXT: s_mov_b32 s50, s12 +; GFX9-MUBUF-NEXT: s_mov_b32 s51, s13 +; GFX9-MUBUF-NEXT: s_mov_b32 s52, s14 +; GFX9-MUBUF-NEXT: s_mov_b32 s53, s15 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v41, v31 +; GFX9-MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-MUBUF-NEXT: s_swappc_b64 s[30:31], s[64:65] +; GFX9-MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:28 +; GFX9-MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:24 +; GFX9-MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:20 +; GFX9-MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 +; GFX9-MUBUF-NEXT: v_lshrrev_b32_e64 v0, 6, s33 +; GFX9-MUBUF-NEXT: 
s_mov_b64 s[4:5], s[34:35] +; GFX9-MUBUF-NEXT: s_mov_b64 s[6:7], s[36:37] +; GFX9-MUBUF-NEXT: s_mov_b64 s[8:9], s[38:39] +; GFX9-MUBUF-NEXT: s_mov_b64 s[10:11], s[48:49] +; GFX9-MUBUF-NEXT: s_mov_b32 s12, s50 +; GFX9-MUBUF-NEXT: s_mov_b32 s13, s51 +; GFX9-MUBUF-NEXT: s_mov_b32 s14, s52 +; GFX9-MUBUF-NEXT: s_mov_b32 s15, s53 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v31, v41 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v0, 16, v0 +; GFX9-MUBUF-NEXT: s_swappc_b64 s[30:31], s[64:65] +; GFX9-MUBUF-NEXT: .LBB11_2: ; %bb5 +; GFX9-MUBUF-NEXT: s_or_b64 exec, exec, s[54:55] +; GFX9-MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-MUBUF-NEXT: v_readlane_b32 s65, v42, 17 +; GFX9-MUBUF-NEXT: v_readlane_b32 s64, v42, 16 +; GFX9-MUBUF-NEXT: v_readlane_b32 s55, v42, 15 +; GFX9-MUBUF-NEXT: v_readlane_b32 s54, v42, 14 +; GFX9-MUBUF-NEXT: v_readlane_b32 s53, v42, 13 +; GFX9-MUBUF-NEXT: v_readlane_b32 s52, v42, 12 +; GFX9-MUBUF-NEXT: v_readlane_b32 s51, v42, 11 +; GFX9-MUBUF-NEXT: v_readlane_b32 s50, v42, 10 +; GFX9-MUBUF-NEXT: v_readlane_b32 s49, v42, 9 +; GFX9-MUBUF-NEXT: v_readlane_b32 s48, v42, 8 +; GFX9-MUBUF-NEXT: v_readlane_b32 s39, v42, 7 +; GFX9-MUBUF-NEXT: v_readlane_b32 s38, v42, 6 +; GFX9-MUBUF-NEXT: v_readlane_b32 s37, v42, 5 +; GFX9-MUBUF-NEXT: v_readlane_b32 s36, v42, 4 +; GFX9-MUBUF-NEXT: v_readlane_b32 s35, v42, 3 +; GFX9-MUBUF-NEXT: v_readlane_b32 s34, v42, 2 +; GFX9-MUBUF-NEXT: v_readlane_b32 s31, v42, 1 +; GFX9-MUBUF-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-MUBUF-NEXT: s_mov_b32 s32, s33 +; GFX9-MUBUF-NEXT: v_readlane_b32 s4, v42, 18 +; GFX9-MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-MUBUF-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GFX9-MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-MUBUF-NEXT: s_mov_b32 s33, s4 +; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX9-MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLATSCR-LABEL: 
undefined_stack_store_reg: +; GFX9-FLATSCR: ; %bb.0: ; %bb +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT: s_mov_b32 s0, s33 +; GFX9-FLATSCR-NEXT: s_mov_b32 s33, s32 +; GFX9-FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-FLATSCR-NEXT: scratch_store_dword off, v44, s33 offset:32 ; 4-byte Folded Spill +; GFX9-FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-FLATSCR-NEXT: v_writelane_b32 v44, s0, 18 +; GFX9-FLATSCR-NEXT: v_writelane_b32 v44, s30, 0 +; GFX9-FLATSCR-NEXT: v_writelane_b32 v44, s31, 1 +; GFX9-FLATSCR-NEXT: v_writelane_b32 v44, s34, 2 +; GFX9-FLATSCR-NEXT: v_writelane_b32 v44, s35, 3 +; GFX9-FLATSCR-NEXT: v_writelane_b32 v44, s36, 4 +; GFX9-FLATSCR-NEXT: v_writelane_b32 v44, s37, 5 +; GFX9-FLATSCR-NEXT: v_writelane_b32 v44, s38, 6 +; GFX9-FLATSCR-NEXT: v_writelane_b32 v44, s39, 7 +; GFX9-FLATSCR-NEXT: v_writelane_b32 v44, s48, 8 +; GFX9-FLATSCR-NEXT: v_writelane_b32 v44, s49, 9 +; GFX9-FLATSCR-NEXT: v_writelane_b32 v44, s50, 10 +; GFX9-FLATSCR-NEXT: v_writelane_b32 v44, s51, 11 +; GFX9-FLATSCR-NEXT: v_writelane_b32 v44, s52, 12 +; GFX9-FLATSCR-NEXT: v_writelane_b32 v44, s53, 13 +; GFX9-FLATSCR-NEXT: v_writelane_b32 v44, s54, 14 +; GFX9-FLATSCR-NEXT: v_writelane_b32 v44, s55, 15 +; GFX9-FLATSCR-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill +; GFX9-FLATSCR-NEXT: v_writelane_b32 v44, s64, 16 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-FLATSCR-NEXT: s_add_i32 s32, s32, 48 +; GFX9-FLATSCR-NEXT: v_writelane_b32 v44, s65, 17 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[40:43], s0 +; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[54:55], vcc +; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-FLATSCR-NEXT: ; %bb.1: ; %bb4 +; GFX9-FLATSCR-NEXT: s_getpc_b64 s[0:1] +; GFX9-FLATSCR-NEXT: s_add_u32 s0, s0, func@gotpcrel32@lo+4 +; GFX9-FLATSCR-NEXT: s_addc_u32 s1, s1, 
func@gotpcrel32@hi+12 +; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[64:65], s[0:1], 0x0 +; GFX9-FLATSCR-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-FLATSCR-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX9-FLATSCR-NEXT: s_mov_b64 s[38:39], s[8:9] +; GFX9-FLATSCR-NEXT: s_mov_b64 s[48:49], s[10:11] +; GFX9-FLATSCR-NEXT: s_mov_b32 s50, s12 +; GFX9-FLATSCR-NEXT: s_mov_b32 s51, s13 +; GFX9-FLATSCR-NEXT: s_mov_b32 s52, s14 +; GFX9-FLATSCR-NEXT: s_mov_b32 s53, s15 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v41, v31 +; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-NEXT: s_swappc_b64 s[30:31], s[64:65] +; GFX9-FLATSCR-NEXT: s_add_i32 s0, s33, 16 +; GFX9-FLATSCR-NEXT: s_mov_b64 s[4:5], s[34:35] +; GFX9-FLATSCR-NEXT: s_mov_b64 s[6:7], s[36:37] +; GFX9-FLATSCR-NEXT: s_mov_b64 s[8:9], s[38:39] +; GFX9-FLATSCR-NEXT: s_mov_b64 s[10:11], s[48:49] +; GFX9-FLATSCR-NEXT: s_mov_b32 s12, s50 +; GFX9-FLATSCR-NEXT: s_mov_b32 s13, s51 +; GFX9-FLATSCR-NEXT: s_mov_b32 s14, s52 +; GFX9-FLATSCR-NEXT: s_mov_b32 s15, s53 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v31, v41 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[40:43], s33 offset:16 +; GFX9-FLATSCR-NEXT: s_swappc_b64 s[30:31], s[64:65] +; GFX9-FLATSCR-NEXT: .LBB11_2: ; %bb5 +; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[54:55] +; GFX9-FLATSCR-NEXT: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload +; GFX9-FLATSCR-NEXT: scratch_load_dword v40, off, s33 offset:4 ; 4-byte Folded Reload +; GFX9-FLATSCR-NEXT: v_readlane_b32 s65, v44, 17 +; GFX9-FLATSCR-NEXT: v_readlane_b32 s64, v44, 16 +; GFX9-FLATSCR-NEXT: v_readlane_b32 s55, v44, 15 +; GFX9-FLATSCR-NEXT: v_readlane_b32 s54, v44, 14 +; GFX9-FLATSCR-NEXT: v_readlane_b32 s53, v44, 13 +; GFX9-FLATSCR-NEXT: v_readlane_b32 s52, v44, 12 +; GFX9-FLATSCR-NEXT: v_readlane_b32 s51, v44, 11 +; GFX9-FLATSCR-NEXT: v_readlane_b32 s50, v44, 10 +; GFX9-FLATSCR-NEXT: v_readlane_b32 s49, v44, 9 +; GFX9-FLATSCR-NEXT: v_readlane_b32 s48, v44, 8 +; GFX9-FLATSCR-NEXT: v_readlane_b32 
s39, v44, 7 +; GFX9-FLATSCR-NEXT: v_readlane_b32 s38, v44, 6 +; GFX9-FLATSCR-NEXT: v_readlane_b32 s37, v44, 5 +; GFX9-FLATSCR-NEXT: v_readlane_b32 s36, v44, 4 +; GFX9-FLATSCR-NEXT: v_readlane_b32 s35, v44, 3 +; GFX9-FLATSCR-NEXT: v_readlane_b32 s34, v44, 2 +; GFX9-FLATSCR-NEXT: v_readlane_b32 s31, v44, 1 +; GFX9-FLATSCR-NEXT: v_readlane_b32 s30, v44, 0 +; GFX9-FLATSCR-NEXT: s_mov_b32 s32, s33 +; GFX9-FLATSCR-NEXT: v_readlane_b32 s0, v44, 18 +; GFX9-FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-FLATSCR-NEXT: scratch_load_dword v44, off, s33 offset:32 ; 4-byte Folded Reload +; GFX9-FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-FLATSCR-NEXT: s_mov_b32 s33, s0 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: undefined_stack_store_reg: +; GFX11-TRUE16: ; %bb.0: ; %bb +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s33 +; GFX11-TRUE16-NEXT: s_mov_b32 s33, s32 +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s33 offset:32 ; 4-byte Folded Spill +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-TRUE16-NEXT: v_writelane_b32 v44, s0, 17 +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s33 offset:4 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s33 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v40, v0 +; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 48 +; GFX11-TRUE16-NEXT: v_writelane_b32 v44, s30, 0 +; GFX11-TRUE16-NEXT: scratch_store_b128 off, v[40:43], s0 +; GFX11-TRUE16-NEXT: v_writelane_b32 v44, s31, 1 +; GFX11-TRUE16-NEXT: v_writelane_b32 v44, s34, 2 +; GFX11-TRUE16-NEXT: v_writelane_b32 v44, s35, 3 +; GFX11-TRUE16-NEXT: v_writelane_b32 v44, s36, 4 +; GFX11-TRUE16-NEXT: v_writelane_b32 v44, s37, 5 +; GFX11-TRUE16-NEXT: v_writelane_b32 v44, s38, 6 +; GFX11-TRUE16-NEXT: v_writelane_b32 v44, s39, 7 +; GFX11-TRUE16-NEXT: v_writelane_b32 v44, s48, 8 +; GFX11-TRUE16-NEXT: v_writelane_b32 
v44, s49, 9 +; GFX11-TRUE16-NEXT: v_writelane_b32 v44, s50, 10 +; GFX11-TRUE16-NEXT: v_writelane_b32 v44, s51, 11 +; GFX11-TRUE16-NEXT: v_writelane_b32 v44, s52, 12 +; GFX11-TRUE16-NEXT: v_writelane_b32 v44, s53, 13 +; GFX11-TRUE16-NEXT: v_writelane_b32 v44, s54, 14 +; GFX11-TRUE16-NEXT: s_mov_b32 s54, exec_lo +; GFX11-TRUE16-NEXT: v_writelane_b32 v44, s64, 15 +; GFX11-TRUE16-NEXT: v_writelane_b32 v44, s65, 16 +; GFX11-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %bb4 +; GFX11-TRUE16-NEXT: s_getpc_b64 s[0:1] +; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, func@gotpcrel32@lo+4 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, func@gotpcrel32@hi+12 +; GFX11-TRUE16-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX11-TRUE16-NEXT: s_load_b64 s[64:65], s[0:1], 0x0 +; GFX11-TRUE16-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX11-TRUE16-NEXT: s_mov_b64 s[38:39], s[8:9] +; GFX11-TRUE16-NEXT: s_mov_b64 s[48:49], s[10:11] +; GFX11-TRUE16-NEXT: s_mov_b32 s50, s12 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v41, v31 +; GFX11-TRUE16-NEXT: s_mov_b32 s51, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s52, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s53, s15 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[64:65] +; GFX11-TRUE16-NEXT: s_add_i32 s0, s33, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v41 :: v_dual_mov_b32 v0, s0 +; GFX11-TRUE16-NEXT: s_mov_b64 s[4:5], s[34:35] +; GFX11-TRUE16-NEXT: s_mov_b64 s[6:7], s[36:37] +; GFX11-TRUE16-NEXT: s_mov_b64 s[8:9], s[38:39] +; GFX11-TRUE16-NEXT: s_mov_b64 s[10:11], s[48:49] +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s50 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s51 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s52 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s53 +; GFX11-TRUE16-NEXT: scratch_store_b128 off, v[40:43], s33 offset:16 +; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[64:65] +; GFX11-TRUE16-NEXT: .LBB11_2: ; %bb5 +; GFX11-TRUE16-NEXT: s_or_b32 
exec_lo, exec_lo, s54 +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s65, v44, 16 +; GFX11-TRUE16-NEXT: v_readlane_b32 s64, v44, 15 +; GFX11-TRUE16-NEXT: v_readlane_b32 s54, v44, 14 +; GFX11-TRUE16-NEXT: v_readlane_b32 s53, v44, 13 +; GFX11-TRUE16-NEXT: v_readlane_b32 s52, v44, 12 +; GFX11-TRUE16-NEXT: v_readlane_b32 s51, v44, 11 +; GFX11-TRUE16-NEXT: v_readlane_b32 s50, v44, 10 +; GFX11-TRUE16-NEXT: v_readlane_b32 s49, v44, 9 +; GFX11-TRUE16-NEXT: v_readlane_b32 s48, v44, 8 +; GFX11-TRUE16-NEXT: v_readlane_b32 s39, v44, 7 +; GFX11-TRUE16-NEXT: v_readlane_b32 s38, v44, 6 +; GFX11-TRUE16-NEXT: v_readlane_b32 s37, v44, 5 +; GFX11-TRUE16-NEXT: v_readlane_b32 s36, v44, 4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s35, v44, 3 +; GFX11-TRUE16-NEXT: v_readlane_b32 s34, v44, 2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v44, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v44, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v44, 17 +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s33 offset:32 ; 4-byte Folded Reload +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s33, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: undefined_stack_store_reg: +; GFX11-FAKE16: ; %bb.0: ; %bb +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s0, s33 +; GFX11-FAKE16-NEXT: s_mov_b32 s33, s32 +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s33 offset:32 ; 4-byte Folded Spill +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v44, s0, 17 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s33 offset:4 +; GFX11-FAKE16-NEXT: scratch_store_b32 
off, v41, s33 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v40, v0 +; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 48 +; GFX11-FAKE16-NEXT: v_writelane_b32 v44, s30, 0 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[40:43], s0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v44, s31, 1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v44, s34, 2 +; GFX11-FAKE16-NEXT: v_writelane_b32 v44, s35, 3 +; GFX11-FAKE16-NEXT: v_writelane_b32 v44, s36, 4 +; GFX11-FAKE16-NEXT: v_writelane_b32 v44, s37, 5 +; GFX11-FAKE16-NEXT: v_writelane_b32 v44, s38, 6 +; GFX11-FAKE16-NEXT: v_writelane_b32 v44, s39, 7 +; GFX11-FAKE16-NEXT: v_writelane_b32 v44, s48, 8 +; GFX11-FAKE16-NEXT: v_writelane_b32 v44, s49, 9 +; GFX11-FAKE16-NEXT: v_writelane_b32 v44, s50, 10 +; GFX11-FAKE16-NEXT: v_writelane_b32 v44, s51, 11 +; GFX11-FAKE16-NEXT: v_writelane_b32 v44, s52, 12 +; GFX11-FAKE16-NEXT: v_writelane_b32 v44, s53, 13 +; GFX11-FAKE16-NEXT: v_writelane_b32 v44, s54, 14 +; GFX11-FAKE16-NEXT: s_mov_b32 s54, exec_lo +; GFX11-FAKE16-NEXT: v_writelane_b32 v44, s64, 15 +; GFX11-FAKE16-NEXT: v_writelane_b32 v44, s65, 16 +; GFX11-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %bb4 +; GFX11-FAKE16-NEXT: s_getpc_b64 s[0:1] +; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, func@gotpcrel32@lo+4 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, func@gotpcrel32@hi+12 +; GFX11-FAKE16-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX11-FAKE16-NEXT: s_load_b64 s[64:65], s[0:1], 0x0 +; GFX11-FAKE16-NEXT: s_mov_b64 s[36:37], s[6:7] +; GFX11-FAKE16-NEXT: s_mov_b64 s[38:39], s[8:9] +; GFX11-FAKE16-NEXT: s_mov_b64 s[48:49], s[10:11] +; GFX11-FAKE16-NEXT: s_mov_b32 s50, s12 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v41, v31 +; GFX11-FAKE16-NEXT: s_mov_b32 s51, s13 +; GFX11-FAKE16-NEXT: s_mov_b32 s52, s14 +; GFX11-FAKE16-NEXT: s_mov_b32 s53, s15 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[64:65] +; GFX11-FAKE16-NEXT: s_add_i32 s0, s33, 16 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v31, v41 :: v_dual_mov_b32 v0, s0 +; GFX11-FAKE16-NEXT: s_mov_b64 s[4:5], s[34:35] +; GFX11-FAKE16-NEXT: s_mov_b64 s[6:7], s[36:37] +; GFX11-FAKE16-NEXT: s_mov_b64 s[8:9], s[38:39] +; GFX11-FAKE16-NEXT: s_mov_b64 s[10:11], s[48:49] +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s50 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s51 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s52 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s53 +; GFX11-FAKE16-NEXT: scratch_store_b128 off, v[40:43], s33 offset:16 +; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[64:65] +; GFX11-FAKE16-NEXT: .LBB11_2: ; %bb5 +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s54 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s33 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s33 offset:4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s65, v44, 16 +; GFX11-FAKE16-NEXT: v_readlane_b32 s64, v44, 15 +; GFX11-FAKE16-NEXT: v_readlane_b32 s54, v44, 14 +; GFX11-FAKE16-NEXT: v_readlane_b32 s53, v44, 13 +; GFX11-FAKE16-NEXT: v_readlane_b32 s52, v44, 12 +; GFX11-FAKE16-NEXT: v_readlane_b32 s51, v44, 11 +; GFX11-FAKE16-NEXT: v_readlane_b32 s50, v44, 10 +; GFX11-FAKE16-NEXT: v_readlane_b32 s49, v44, 9 +; GFX11-FAKE16-NEXT: v_readlane_b32 s48, v44, 8 +; GFX11-FAKE16-NEXT: v_readlane_b32 s39, v44, 7 +; GFX11-FAKE16-NEXT: v_readlane_b32 s38, v44, 6 +; GFX11-FAKE16-NEXT: v_readlane_b32 s37, v44, 5 +; GFX11-FAKE16-NEXT: v_readlane_b32 s36, v44, 4 +; GFX11-FAKE16-NEXT: v_readlane_b32 s35, v44, 3 +; GFX11-FAKE16-NEXT: v_readlane_b32 s34, v44, 2 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v44, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v44, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v44, 17 +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s33 offset:32 ; 4-byte Folded Reload +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-FAKE16-NEXT: s_mov_b32 s33, s0 +; GFX11-FAKE16-NEXT: s_waitcnt 
vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] bb: %tmp = alloca <4 x float>, align 16, addrspace(5) %tmp2 = insertelement <4 x float> poison, float %arg, i32 0 @@ -285,6 +1374,88 @@ bb5: ; GCN: ds_write_b32 v{{[0-9]+}}, [[PTR]] define void @alloca_ptr_nonentry_block(i32 %arg0) #0 { +; CI-LABEL: alloca_ptr_nonentry_block: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CI-NEXT: s_cbranch_execz .LBB12_2 +; CI-NEXT: ; %bb.1: ; %bb +; CI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshr_b32_e64 v1, s32, 6 +; CI-NEXT: v_or_b32_e32 v0, 4, v1 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_write_b32 v0, v0 +; CI-NEXT: .LBB12_2: ; %ret +; CI-NEXT: s_or_b64 exec, exec, s[4:5] +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-MUBUF-LABEL: alloca_ptr_nonentry_block: +; GFX9-MUBUF: ; %bb.0: +; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-MUBUF-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-MUBUF-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-MUBUF-NEXT: ; %bb.1: ; %bb +; GFX9-MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc +; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX9-MUBUF-NEXT: v_lshrrev_b32_e64 v1, 6, s32 +; GFX9-MUBUF-NEXT: v_or_b32_e32 v0, 4, v1 +; GFX9-MUBUF-NEXT: ds_write_b32 v0, v0 +; GFX9-MUBUF-NEXT: .LBB12_2: ; %ret +; GFX9-MUBUF-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLATSCR-LABEL: alloca_ptr_nonentry_block: +; GFX9-FLATSCR: ; %bb.0: +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-FLATSCR-NEXT: ; %bb.1: ; %bb +; GFX9-FLATSCR-NEXT: 
scratch_load_dword v0, off, s32 offset:4 glc +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: v_or_b32_e64 v0, s32, 4 +; GFX9-FLATSCR-NEXT: ds_write_b32 v0, v0 +; GFX9-FLATSCR-NEXT: .LBB12_2: ; %ret +; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: alloca_ptr_nonentry_block: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %bb +; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_or_b32_e64 v0, s32, 4 +; GFX11-TRUE16-NEXT: ds_store_b32 v0, v0 +; GFX11-TRUE16-NEXT: .LBB12_2: ; %ret +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: alloca_ptr_nonentry_block: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo +; GFX11-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-FAKE16-NEXT: ; %bb.1: ; %bb +; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_or_b32_e64 v0, s32, 4 +; GFX11-FAKE16-NEXT: ds_store_b32 v0, v0 +; GFX11-FAKE16-NEXT: .LBB12_2: ; %ret +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %alloca0 = alloca { i8, i32 }, align 8, addrspace(5) %cmp = icmp eq i32 %arg0, 0 br i1 %cmp, label %bb, label %ret @@ -319,6 +1490,79 @@ ret: ; GFX11-FAKE16-DAG: ds_store_b16 v{{[0-9]+}}, [[C]] offset:8 ; GFX11-FAKE16-NEXT: s_endpgm define protected amdgpu_kernel void 
@tied_operand_test(i1 %c1, i1 %c2, i32 %val) { +; CI-LABEL: tied_operand_test: +; CI: ; %bb.0: ; %entry +; CI-NEXT: s_add_u32 s0, s0, s17 +; CI-NEXT: s_addc_u32 s1, s1, 0 +; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; CI-NEXT: s_load_dword s4, s[8:9], 0x1 +; CI-NEXT: v_mov_b32_e32 v1, 0x7b +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_lshl_b32 s4, s4, 1 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: ds_write_b16 v2, v1 offset:8 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: ds_write_b16 v2, v0 offset:10 +; CI-NEXT: s_endpgm +; +; GFX9-MUBUF-LABEL: tied_operand_test: +; GFX9-MUBUF: ; %bb.0: ; %entry +; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s17 +; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-MUBUF-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; GFX9-MUBUF-NEXT: s_load_dword s4, s[8:9], 0x4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX9-MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-MUBUF-NEXT: s_lshl_b32 s4, s4, 1 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-MUBUF-NEXT: ds_write_b16 v2, v1 offset:8 +; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX9-MUBUF-NEXT: ds_write_b16 v2, v0 offset:10 +; GFX9-MUBUF-NEXT: s_endpgm +; +; GFX9-FLATSCR-LABEL: tied_operand_test: +; GFX9-FLATSCR: ; %bb.0: ; %entry +; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 +; GFX9-FLATSCR-NEXT: scratch_load_ushort v0, off, s0 +; GFX9-FLATSCR-NEXT: s_load_dword s0, s[4:5], 0x4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-FLATSCR-NEXT: ds_write_b16 v2, v1 offset:8 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: ds_write_b16 v2, v0 offset:10 +; GFX9-FLATSCR-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: tied_operand_test: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, off +; 
GFX11-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0x7b +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-TRUE16-NEXT: ds_store_b16_d16_hi v1, v0 offset:8 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: ds_store_b16 v1, v0 offset:10 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: tied_operand_test: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, off +; GFX11-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, 0x7b :: v_dual_mov_b32 v2, s0 +; GFX11-FAKE16-NEXT: ds_store_b16 v2, v1 offset:8 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: ds_store_b16 v2, v0 offset:10 +; GFX11-FAKE16-NEXT: s_endpgm entry: %scratch0 = alloca i16, align 4, addrspace(5) %scratch1 = alloca i16, align 4, addrspace(5) @@ -345,6 +1589,115 @@ entry: ; GFX9-MUBUF-NEXT: v_add_u32_e32 [[SCALED_FP]], 0x3000, [[SCALED_FP]] ; GFX9-MUBUF-NEXT: v_add_u32_e32 v0, 64, [[SCALED_FP]] define void @fi_vop3_literal_error() { +; CI-LABEL: fi_vop3_literal_error: +; CI: ; %bb.0: ; %entry +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 s4, s33 +; CI-NEXT: s_add_i32 s33, s32, 0x7ffc0 +; CI-NEXT: s_and_b32 s33, s33, 0xfff80000 +; CI-NEXT: v_lshr_b32_e64 v1, s33, 6 +; CI-NEXT: s_movk_i32 vcc_lo, 0x3000 +; CI-NEXT: v_add_i32_e32 v1, vcc, vcc_lo, v1 +; CI-NEXT: v_add_i32_e32 v0, vcc, 64, v1 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v2, 0x2000 +; CI-NEXT: buffer_store_dword v1, v2, s[0:3], s33 offen +; CI-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:4 glc +; 
CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_mov_b32 s5, s34 +; CI-NEXT: s_mov_b32 s34, s32 +; CI-NEXT: s_add_i32 s32, s32, 0x200000 +; CI-NEXT: s_mov_b32 s32, s34 +; CI-NEXT: s_mov_b32 s34, s5 +; CI-NEXT: s_mov_b32 s33, s4 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-MUBUF-LABEL: fi_vop3_literal_error: +; GFX9-MUBUF: ; %bb.0: ; %entry +; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-MUBUF-NEXT: s_mov_b32 s4, s33 +; GFX9-MUBUF-NEXT: s_add_i32 s33, s32, 0x7ffc0 +; GFX9-MUBUF-NEXT: s_and_b32 s33, s33, 0xfff80000 +; GFX9-MUBUF-NEXT: v_lshrrev_b32_e64 v1, 6, s33 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x3000, v1 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v0, 64, v1 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0x2000 +; GFX9-MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], s33 offen +; GFX9-MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen glc +; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX9-MUBUF-NEXT: s_mov_b32 s5, s34 +; GFX9-MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 glc +; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX9-MUBUF-NEXT: s_mov_b32 s34, s32 +; GFX9-MUBUF-NEXT: s_add_i32 s32, s32, 0x200000 +; GFX9-MUBUF-NEXT: ; kill: killed $vgpr0 +; GFX9-MUBUF-NEXT: s_mov_b32 s32, s34 +; GFX9-MUBUF-NEXT: s_mov_b32 s34, s5 +; GFX9-MUBUF-NEXT: s_mov_b32 s33, s4 +; GFX9-MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLATSCR-LABEL: fi_vop3_literal_error: +; GFX9-FLATSCR: ; %bb.0: ; %entry +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT: s_mov_b32 s0, s33 +; GFX9-FLATSCR-NEXT: s_add_i32 s33, s32, 0x1fff +; GFX9-FLATSCR-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GFX9-FLATSCR-NEXT: s_mov_b32 s1, s34 +; GFX9-FLATSCR-NEXT: s_mov_b32 s34, s32 +; GFX9-FLATSCR-NEXT: s_add_i32 s32, s32, 0x8000 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-FLATSCR-NEXT: s_add_i32 s2, s33, 0x2000 +; GFX9-FLATSCR-NEXT: scratch_store_dword off, v0, s2 +; GFX9-FLATSCR-NEXT: s_add_i32 s2, s33, 0x3000 +; GFX9-FLATSCR-NEXT: 
scratch_load_dwordx2 v[0:1], off, s2 offset:64 glc +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: s_mov_b32 s32, s34 +; GFX9-FLATSCR-NEXT: s_mov_b32 s34, s1 +; GFX9-FLATSCR-NEXT: s_mov_b32 s33, s0 +; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: fi_vop3_literal_error: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s33 +; GFX11-TRUE16-NEXT: s_add_i32 s33, s32, 0x1fff +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-TRUE16-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s34 +; GFX11-TRUE16-NEXT: s_mov_b32 s34, s32 +; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 0x8000 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s33, 0x2000 +; GFX11-TRUE16-NEXT: s_mov_b32 s32, s34 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v0, s2 +; GFX11-TRUE16-NEXT: s_add_i32 s2, s33, 0x3000 +; GFX11-TRUE16-NEXT: s_mov_b32 s34, s1 +; GFX11-TRUE16-NEXT: scratch_load_b64 v[0:1], off, s2 offset:64 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s33, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: fi_vop3_literal_error: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s0, s33 +; GFX11-FAKE16-NEXT: s_add_i32 s33, s32, 0x1fff +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, s34 +; GFX11-FAKE16-NEXT: s_mov_b32 s34, s32 +; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 0x8000 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s33, 0x2000 +; GFX11-FAKE16-NEXT: s_mov_b32 s32, s34 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v0, s2 +; GFX11-FAKE16-NEXT: s_add_i32 s2, s33, 0x3000 +; GFX11-FAKE16-NEXT: s_mov_b32 s34, s1 +; GFX11-FAKE16-NEXT: scratch_load_b64 v[0:1], off, s2 offset:64 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s33, s0 +; 
GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %pin.low = alloca i32, align 8192, addrspace(5) %local.area = alloca [1060 x i64], align 4096, addrspace(5) @@ -363,6 +1716,132 @@ entry: ; GCN: s_add_u32 [[ADD_LO:s[0-9]+]], 0, 0x2010 ; GCN: s_addc_u32 [[ADD_HI:s[0-9]+]], s{{[0-9]+}}, 0 define amdgpu_kernel void @fi_sop2_s_add_u32_literal_error() #0 { +; CI-LABEL: fi_sop2_s_add_u32_literal_error: +; CI: ; %bb.0: ; %entry +; CI-NEXT: s_load_dword s5, s[8:9], 0x30 +; CI-NEXT: s_add_u32 s0, s0, s17 +; CI-NEXT: s_addc_u32 s1, s1, 0 +; CI-NEXT: s_add_u32 s4, 0, 0x2010 +; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_addc_u32 s5, s5, 0 +; CI-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], 2 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; CI-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0 +; CI-NEXT: .LBB15_1: ; %.shuffle.then.i.i.i.i +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: s_and_b64 vcc, exec, s[4:5] +; CI-NEXT: s_cbranch_vccnz .LBB15_1 +; CI-NEXT: ; %bb.2: ; %vector.body.i.i.i.i +; CI-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 +; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; CI-NEXT: s_endpgm +; +; GFX9-MUBUF-LABEL: fi_sop2_s_add_u32_literal_error: +; GFX9-MUBUF: ; %bb.0: ; %entry +; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s17 +; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-MUBUF-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX9-MUBUF-NEXT: s_add_u32 s4, 0, 0x2010 +; GFX9-MUBUF-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-MUBUF-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], 2 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX9-MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-MUBUF-NEXT: 
v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9-MUBUF-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0 +; GFX9-MUBUF-NEXT: .LBB15_1: ; %.shuffle.then.i.i.i.i +; GFX9-MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-MUBUF-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX9-MUBUF-NEXT: s_cbranch_vccnz .LBB15_1 +; GFX9-MUBUF-NEXT: ; %bb.2: ; %vector.body.i.i.i.i +; GFX9-MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 +; GFX9-MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 +; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(1) +; GFX9-MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(1) +; GFX9-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GFX9-MUBUF-NEXT: s_endpgm +; +; GFX9-FLATSCR-LABEL: fi_sop2_s_add_u32_literal_error: +; GFX9-FLATSCR: ; %bb.0: ; %entry +; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GFX9-FLATSCR-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX9-FLATSCR-NEXT: s_add_u32 s0, 0, 0x2010 +; GFX9-FLATSCR-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-FLATSCR-NEXT: v_cmp_lt_u64_e64 s[0:1], s[0:1], 2 +; GFX9-FLATSCR-NEXT: s_mov_b32 s2, 0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s2 +; GFX9-FLATSCR-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9-FLATSCR-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; GFX9-FLATSCR-NEXT: .LBB15_1: ; %.shuffle.then.i.i.i.i +; GFX9-FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-FLATSCR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX9-FLATSCR-NEXT: s_cbranch_vccnz .LBB15_1 +; GFX9-FLATSCR-NEXT: ; %bb.2: ; %vector.body.i.i.i.i +; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 +; GFX9-FLATSCR-NEXT: s_nop 1 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:4 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX9-FLATSCR-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: 
fi_sop2_s_add_u32_literal_error: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-TRUE16-NEXT: s_add_u32 s0, 0, 0x2010 +; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-TRUE16-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], 2 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-TRUE16-NEXT: scratch_store_b64 off, v[0:1], s1 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e64 s0, 1, v2 +; GFX11-TRUE16-NEXT: .LBB15_1: ; %.shuffle.then.i.i.i.i +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %vector.body.i.i.i.i +; GFX11-TRUE16-NEXT: scratch_load_b64 v[0:1], off, off offset:4 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: scratch_store_b64 off, v[0:1], s0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fi_sop2_s_add_u32_literal_error: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-FAKE16-NEXT: s_add_u32 s0, 0, 0x2010 +; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], 2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-FAKE16-NEXT: scratch_store_b64 off, v[0:1], s1 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e64 s0, 1, v2 +; GFX11-FAKE16-NEXT: .LBB15_1: ; %.shuffle.then.i.i.i.i +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB15_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %vector.body.i.i.i.i +; 
GFX11-FAKE16-NEXT: scratch_load_b64 v[0:1], off, off offset:4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: scratch_store_b64 off, v[0:1], s0 +; GFX11-FAKE16-NEXT: s_endpgm entry: %.omp.reduction.element.i.i.i.i = alloca [1024 x i32], align 4, addrspace(5) %Total3.i.i = alloca [1024 x i32], align 16, addrspace(5) @@ -385,6 +1864,116 @@ vector.body.i.i.i.i: ; preds = %.shuffle.then.i.i.i ; GCN-LABEL: {{^}}fi_sop2_and_literal_error: ; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x1fe00 define amdgpu_kernel void @fi_sop2_and_literal_error() #0 { +; CI-LABEL: fi_sop2_and_literal_error: +; CI: ; %bb.0: ; %entry +; CI-NEXT: s_add_u32 s0, s0, s17 +; CI-NEXT: s_addc_u32 s1, s1, 0 +; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: s_mov_b64 s[4:5], -1 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; CI-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0 +; CI-NEXT: .LBB16_1: ; %.shuffle.then.i.i.i.i +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: s_and_b64 vcc, exec, s[4:5] +; CI-NEXT: s_cbranch_vccnz .LBB16_1 +; CI-NEXT: ; %bb.2: ; %vector.body.i.i.i.i +; CI-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 +; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; CI-NEXT: s_endpgm +; +; GFX9-MUBUF-LABEL: fi_sop2_and_literal_error: +; GFX9-MUBUF: ; %bb.0: ; %entry +; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s17 +; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-MUBUF-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX9-MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-MUBUF-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9-MUBUF-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0 
+; GFX9-MUBUF-NEXT: .LBB16_1: ; %.shuffle.then.i.i.i.i +; GFX9-MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-MUBUF-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX9-MUBUF-NEXT: s_cbranch_vccnz .LBB16_1 +; GFX9-MUBUF-NEXT: ; %bb.2: ; %vector.body.i.i.i.i +; GFX9-MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 +; GFX9-MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 +; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(1) +; GFX9-MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(1) +; GFX9-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GFX9-MUBUF-NEXT: s_endpgm +; +; GFX9-FLATSCR-LABEL: fi_sop2_and_literal_error: +; GFX9-FLATSCR: ; %bb.0: ; %entry +; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX9-FLATSCR-NEXT: s_mov_b64 s[0:1], -1 +; GFX9-FLATSCR-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9-FLATSCR-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; GFX9-FLATSCR-NEXT: .LBB16_1: ; %.shuffle.then.i.i.i.i +; GFX9-FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-FLATSCR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX9-FLATSCR-NEXT: s_cbranch_vccnz .LBB16_1 +; GFX9-FLATSCR-NEXT: ; %bb.2: ; %vector.body.i.i.i.i +; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 +; GFX9-FLATSCR-NEXT: s_nop 1 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:4 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX9-FLATSCR-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: fi_sop2_and_literal_error: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_mov_b32 s0, -1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e64 s0, 1, v2 +; GFX11-TRUE16-NEXT: scratch_store_b64 off, v[0:1], s1 +; GFX11-TRUE16-NEXT: .LBB16_1: ; %.shuffle.then.i.i.i.i +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB16_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %vector.body.i.i.i.i +; GFX11-TRUE16-NEXT: scratch_load_b64 v[0:1], off, off offset:4 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: scratch_store_b64 off, v[0:1], s0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fi_sop2_and_literal_error: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_mov_b32 s0, -1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e64 s0, 1, v2 +; GFX11-FAKE16-NEXT: scratch_store_b64 off, v[0:1], s1 +; GFX11-FAKE16-NEXT: .LBB16_1: ; %.shuffle.then.i.i.i.i +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB16_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %vector.body.i.i.i.i +; GFX11-FAKE16-NEXT: scratch_load_b64 v[0:1], off, off offset:4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: scratch_store_b64 off, v[0:1], s0 +; GFX11-FAKE16-NEXT: s_endpgm entry: %.omp.reduction.element.i.i.i.i = alloca [1024 x i32], align 4, addrspace(5) %Total3.i.i = alloca [1024 x i32], align 16, addrspace(5) @@ -406,6 +1995,116 @@ vector.body.i.i.i.i: ; preds = %.shuffle.then.i.i.i ; GCN-LABEL: {{^}}fi_sop2_or_literal_error: ; GCN: s_or_b32 s{{[0-9]+}}, 
s{{[0-9]+}}, 0x3039 define amdgpu_kernel void @fi_sop2_or_literal_error() #0 { +; CI-LABEL: fi_sop2_or_literal_error: +; CI: ; %bb.0: ; %entry +; CI-NEXT: s_add_u32 s0, s0, s17 +; CI-NEXT: s_addc_u32 s1, s1, 0 +; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: s_mov_b64 s[4:5], -1 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; CI-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0 +; CI-NEXT: .LBB17_1: ; %.shuffle.then.i.i.i.i +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: s_and_b64 vcc, exec, s[4:5] +; CI-NEXT: s_cbranch_vccnz .LBB17_1 +; CI-NEXT: ; %bb.2: ; %vector.body.i.i.i.i +; CI-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 +; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; CI-NEXT: s_endpgm +; +; GFX9-MUBUF-LABEL: fi_sop2_or_literal_error: +; GFX9-MUBUF: ; %bb.0: ; %entry +; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s17 +; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-MUBUF-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX9-MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-MUBUF-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9-MUBUF-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0 +; GFX9-MUBUF-NEXT: .LBB17_1: ; %.shuffle.then.i.i.i.i +; GFX9-MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-MUBUF-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX9-MUBUF-NEXT: s_cbranch_vccnz .LBB17_1 +; GFX9-MUBUF-NEXT: ; %bb.2: ; %vector.body.i.i.i.i +; GFX9-MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 +; GFX9-MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 +; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(1) +; GFX9-MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX9-MUBUF-NEXT: s_waitcnt 
vmcnt(1) +; GFX9-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GFX9-MUBUF-NEXT: s_endpgm +; +; GFX9-FLATSCR-LABEL: fi_sop2_or_literal_error: +; GFX9-FLATSCR: ; %bb.0: ; %entry +; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX9-FLATSCR-NEXT: s_mov_b64 s[0:1], -1 +; GFX9-FLATSCR-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9-FLATSCR-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; GFX9-FLATSCR-NEXT: .LBB17_1: ; %.shuffle.then.i.i.i.i +; GFX9-FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-FLATSCR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX9-FLATSCR-NEXT: s_cbranch_vccnz .LBB17_1 +; GFX9-FLATSCR-NEXT: ; %bb.2: ; %vector.body.i.i.i.i +; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 +; GFX9-FLATSCR-NEXT: s_nop 1 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:4 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX9-FLATSCR-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: fi_sop2_or_literal_error: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_mov_b32 s0, -1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e64 s0, 1, v2 +; GFX11-TRUE16-NEXT: scratch_store_b64 off, v[0:1], s1 +; GFX11-TRUE16-NEXT: .LBB17_1: ; %.shuffle.then.i.i.i.i +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB17_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %vector.body.i.i.i.i +; GFX11-TRUE16-NEXT: scratch_load_b64 v[0:1], off, off offset:4 
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: scratch_store_b64 off, v[0:1], s0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fi_sop2_or_literal_error: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_mov_b32 s0, -1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e64 s0, 1, v2 +; GFX11-FAKE16-NEXT: scratch_store_b64 off, v[0:1], s1 +; GFX11-FAKE16-NEXT: .LBB17_1: ; %.shuffle.then.i.i.i.i +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB17_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %vector.body.i.i.i.i +; GFX11-FAKE16-NEXT: scratch_load_b64 v[0:1], off, off offset:4 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: scratch_store_b64 off, v[0:1], s0 +; GFX11-FAKE16-NEXT: s_endpgm entry: %.omp.reduction.element.i.i.i.i = alloca [1024 x i32], align 4, addrspace(5) %Total3.i.i = alloca [1024 x i32], align 16, addrspace(5) @@ -435,6 +2134,76 @@ vector.body.i.i.i.i: ; preds = %.shuffle.then.i.i.i ; GCN: s_mov_b32 [[ALLOCA0:s[0-9]+]], 0 ; GCN: ; use [[SELECT]], [[ALLOCA0]] define amdgpu_kernel void @s_multiple_frame_indexes_literal_offsets(i32 inreg %arg0) #0 { +; CI-LABEL: s_multiple_frame_indexes_literal_offsets: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s4, s[8:9], 0x0 +; CI-NEXT: s_add_u32 s0, s0, s17 +; CI-NEXT: s_addc_u32 s1, s1, 0 +; CI-NEXT: s_movk_i32 s5, 0x44 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_cmp_eq_u32 s4, 0 +; CI-NEXT: s_cselect_b32 s4, s5, 0x48 +; CI-NEXT: s_mov_b32 s5, 0 +; CI-NEXT: ;;#ASMSTART +; CI-NEXT: ; use s4, s5 +; CI-NEXT: ;;#ASMEND +; CI-NEXT: s_endpgm +; +; 
GFX9-MUBUF-LABEL: s_multiple_frame_indexes_literal_offsets: +; GFX9-MUBUF: ; %bb.0: +; GFX9-MUBUF-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s17 +; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-MUBUF-NEXT: s_movk_i32 s5, 0x44 +; GFX9-MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-MUBUF-NEXT: s_cmp_eq_u32 s4, 0 +; GFX9-MUBUF-NEXT: s_cselect_b32 s4, s5, 0x48 +; GFX9-MUBUF-NEXT: s_mov_b32 s5, 0 +; GFX9-MUBUF-NEXT: ;;#ASMSTART +; GFX9-MUBUF-NEXT: ; use s4, s5 +; GFX9-MUBUF-NEXT: ;;#ASMEND +; GFX9-MUBUF-NEXT: s_endpgm +; +; GFX9-FLATSCR-LABEL: s_multiple_frame_indexes_literal_offsets: +; GFX9-FLATSCR: ; %bb.0: +; GFX9-FLATSCR-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GFX9-FLATSCR-NEXT: s_movk_i32 s1, 0x44 +; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-NEXT: s_cmp_eq_u32 s0, 0 +; GFX9-FLATSCR-NEXT: s_cselect_b32 s0, s1, 0x48 +; GFX9-FLATSCR-NEXT: s_mov_b32 s1, 0 +; GFX9-FLATSCR-NEXT: ;;#ASMSTART +; GFX9-FLATSCR-NEXT: ; use s0, s1 +; GFX9-FLATSCR-NEXT: ;;#ASMEND +; GFX9-FLATSCR-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: s_multiple_frame_indexes_literal_offsets: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-TRUE16-NEXT: s_movk_i32 s1, 0x44 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s0, 0 +; GFX11-TRUE16-NEXT: s_cselect_b32 s0, s1, 0x48 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: ;;#ASMSTART +; GFX11-TRUE16-NEXT: ; use s0, s1 +; GFX11-TRUE16-NEXT: ;;#ASMEND +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: s_multiple_frame_indexes_literal_offsets: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-FAKE16-NEXT: s_movk_i32 s1, 0x44 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s0, 0 +; GFX11-FAKE16-NEXT: s_cselect_b32 s0, s1, 0x48 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; 
GFX11-FAKE16-NEXT: ;;#ASMSTART +; GFX11-FAKE16-NEXT: ; use s0, s1 +; GFX11-FAKE16-NEXT: ;;#ASMEND +; GFX11-FAKE16-NEXT: s_endpgm %alloca0 = alloca [17 x i32], align 8, addrspace(5) %alloca1 = alloca i32, align 4, addrspace(5) %alloca2 = alloca i32, align 4, addrspace(5) @@ -455,6 +2224,76 @@ define amdgpu_kernel void @s_multiple_frame_indexes_literal_offsets(i32 inreg %a ; GCN: s_mov_b32 [[ALLOCA0:s[0-9]+]], 0 ; GCN: ; use [[SELECT]], [[ALLOCA0]] define amdgpu_kernel void @s_multiple_frame_indexes_one_imm_one_literal_offset(i32 inreg %arg0) #0 { +; CI-LABEL: s_multiple_frame_indexes_one_imm_one_literal_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s4, s[8:9], 0x0 +; CI-NEXT: s_add_u32 s0, s0, s17 +; CI-NEXT: s_addc_u32 s1, s1, 0 +; CI-NEXT: s_mov_b32 s5, 64 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_cmp_eq_u32 s4, 0 +; CI-NEXT: s_cselect_b32 s4, s5, 0x44 +; CI-NEXT: s_mov_b32 s5, 0 +; CI-NEXT: ;;#ASMSTART +; CI-NEXT: ; use s4, s5 +; CI-NEXT: ;;#ASMEND +; CI-NEXT: s_endpgm +; +; GFX9-MUBUF-LABEL: s_multiple_frame_indexes_one_imm_one_literal_offset: +; GFX9-MUBUF: ; %bb.0: +; GFX9-MUBUF-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s17 +; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-MUBUF-NEXT: s_mov_b32 s5, 64 +; GFX9-MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-MUBUF-NEXT: s_cmp_eq_u32 s4, 0 +; GFX9-MUBUF-NEXT: s_cselect_b32 s4, s5, 0x44 +; GFX9-MUBUF-NEXT: s_mov_b32 s5, 0 +; GFX9-MUBUF-NEXT: ;;#ASMSTART +; GFX9-MUBUF-NEXT: ; use s4, s5 +; GFX9-MUBUF-NEXT: ;;#ASMEND +; GFX9-MUBUF-NEXT: s_endpgm +; +; GFX9-FLATSCR-LABEL: s_multiple_frame_indexes_one_imm_one_literal_offset: +; GFX9-FLATSCR: ; %bb.0: +; GFX9-FLATSCR-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GFX9-FLATSCR-NEXT: s_mov_b32 s1, 64 +; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-NEXT: s_cmp_eq_u32 s0, 0 +; GFX9-FLATSCR-NEXT: s_cselect_b32 s0, s1, 0x44 +; 
GFX9-FLATSCR-NEXT: s_mov_b32 s1, 0 +; GFX9-FLATSCR-NEXT: ;;#ASMSTART +; GFX9-FLATSCR-NEXT: ; use s0, s1 +; GFX9-FLATSCR-NEXT: ;;#ASMEND +; GFX9-FLATSCR-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: s_multiple_frame_indexes_one_imm_one_literal_offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 64 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s0, 0 +; GFX11-TRUE16-NEXT: s_cselect_b32 s0, s1, 0x44 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: ;;#ASMSTART +; GFX11-TRUE16-NEXT: ; use s0, s1 +; GFX11-TRUE16-NEXT: ;;#ASMEND +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: s_multiple_frame_indexes_one_imm_one_literal_offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 64 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s0, 0 +; GFX11-FAKE16-NEXT: s_cselect_b32 s0, s1, 0x44 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: ;;#ASMSTART +; GFX11-FAKE16-NEXT: ; use s0, s1 +; GFX11-FAKE16-NEXT: ;;#ASMEND +; GFX11-FAKE16-NEXT: s_endpgm %alloca0 = alloca [16 x i32], align 8, addrspace(5) %alloca1 = alloca i32, align 4, addrspace(5) %alloca2 = alloca i32, align 4, addrspace(5) @@ -472,6 +2311,76 @@ define amdgpu_kernel void @s_multiple_frame_indexes_one_imm_one_literal_offset(i ; GCN: s_mov_b32 [[ALLOCA0:s[0-9]+]], 0 ; GCN: ; use [[SELECT]], [[ALLOCA0]] define amdgpu_kernel void @s_multiple_frame_indexes_imm_offsets(i32 inreg %arg0) #0 { +; CI-LABEL: s_multiple_frame_indexes_imm_offsets: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s4, s[8:9], 0x0 +; CI-NEXT: s_add_u32 s0, s0, s17 +; CI-NEXT: s_addc_u32 s1, s1, 0 +; CI-NEXT: s_mov_b32 s5, 16 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_cmp_eq_u32 s4, 0 +; CI-NEXT: s_cselect_b32 s4, s5, 20 +; CI-NEXT: s_mov_b32 s5, 0 +; CI-NEXT: ;;#ASMSTART +; CI-NEXT: ; use s4, s5 +; CI-NEXT: ;;#ASMEND +; CI-NEXT: s_endpgm +; +; 
GFX9-MUBUF-LABEL: s_multiple_frame_indexes_imm_offsets: +; GFX9-MUBUF: ; %bb.0: +; GFX9-MUBUF-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s17 +; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-MUBUF-NEXT: s_mov_b32 s5, 16 +; GFX9-MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-MUBUF-NEXT: s_cmp_eq_u32 s4, 0 +; GFX9-MUBUF-NEXT: s_cselect_b32 s4, s5, 20 +; GFX9-MUBUF-NEXT: s_mov_b32 s5, 0 +; GFX9-MUBUF-NEXT: ;;#ASMSTART +; GFX9-MUBUF-NEXT: ; use s4, s5 +; GFX9-MUBUF-NEXT: ;;#ASMEND +; GFX9-MUBUF-NEXT: s_endpgm +; +; GFX9-FLATSCR-LABEL: s_multiple_frame_indexes_imm_offsets: +; GFX9-FLATSCR: ; %bb.0: +; GFX9-FLATSCR-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GFX9-FLATSCR-NEXT: s_mov_b32 s1, 16 +; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLATSCR-NEXT: s_cmp_eq_u32 s0, 0 +; GFX9-FLATSCR-NEXT: s_cselect_b32 s0, s1, 20 +; GFX9-FLATSCR-NEXT: s_mov_b32 s1, 0 +; GFX9-FLATSCR-NEXT: ;;#ASMSTART +; GFX9-FLATSCR-NEXT: ; use s0, s1 +; GFX9-FLATSCR-NEXT: ;;#ASMEND +; GFX9-FLATSCR-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: s_multiple_frame_indexes_imm_offsets: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 16 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s0, 0 +; GFX11-TRUE16-NEXT: s_cselect_b32 s0, s1, 20 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 +; GFX11-TRUE16-NEXT: ;;#ASMSTART +; GFX11-TRUE16-NEXT: ; use s0, s1 +; GFX11-TRUE16-NEXT: ;;#ASMEND +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: s_multiple_frame_indexes_imm_offsets: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 16 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s0, 0 +; GFX11-FAKE16-NEXT: s_cselect_b32 s0, s1, 20 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 +; GFX11-FAKE16-NEXT: ;;#ASMSTART +; 
GFX11-FAKE16-NEXT: ; use s0, s1 +; GFX11-FAKE16-NEXT: ;;#ASMEND +; GFX11-FAKE16-NEXT: s_endpgm %alloca0 = alloca [4 x i32], align 8, addrspace(5) %alloca1 = alloca i32, align 4, addrspace(5) %alloca2 = alloca i32, align 4, addrspace(5) @@ -489,6 +2398,71 @@ define amdgpu_kernel void @s_multiple_frame_indexes_imm_offsets(i32 inreg %arg0) ; GCN: v_mov_b32_e32 [[ALLOCA0:v[0-9]+]], 0{{$}} ; GCN: ; use [[SELECT]], [[ALLOCA0]] define amdgpu_kernel void @v_multiple_frame_indexes_literal_offsets() #0 { +; CI-LABEL: v_multiple_frame_indexes_literal_offsets: +; CI: ; %bb.0: +; CI-NEXT: s_add_u32 s0, s0, s17 +; CI-NEXT: v_mov_b32_e32 v1, 0x48 +; CI-NEXT: v_mov_b32_e32 v2, 0x44 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: s_addc_u32 s1, s1, 0 +; CI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: ;;#ASMSTART +; CI-NEXT: ; use v0, v1 +; CI-NEXT: ;;#ASMEND +; CI-NEXT: s_endpgm +; +; GFX9-MUBUF-LABEL: v_multiple_frame_indexes_literal_offsets: +; GFX9-MUBUF: ; %bb.0: +; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s17 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v1, 0x48 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0x44 +; GFX9-MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-MUBUF-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-MUBUF-NEXT: ;;#ASMSTART +; GFX9-MUBUF-NEXT: ; use v0, v1 +; GFX9-MUBUF-NEXT: ;;#ASMEND +; GFX9-MUBUF-NEXT: s_endpgm +; +; GFX9-FLATSCR-LABEL: v_multiple_frame_indexes_literal_offsets: +; GFX9-FLATSCR: ; %bb.0: +; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x48 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0x44 +; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GFX9-FLATSCR-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-FLATSCR-NEXT: ;;#ASMSTART +; GFX9-FLATSCR-NEXT: ; use v0, v1 +; GFX9-FLATSCR-NEXT: ;;#ASMEND +; 
GFX9-FLATSCR-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: v_multiple_frame_indexes_literal_offsets: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0x44 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, 0x48, v1 :: v_dual_mov_b32 v1, 0 +; GFX11-TRUE16-NEXT: ;;#ASMSTART +; GFX11-TRUE16-NEXT: ; use v0, v1 +; GFX11-TRUE16-NEXT: ;;#ASMEND +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: v_multiple_frame_indexes_literal_offsets: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0x44 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, 0x48, v1 :: v_dual_mov_b32 v1, 0 +; GFX11-FAKE16-NEXT: ;;#ASMSTART +; GFX11-FAKE16-NEXT: ; use v0, v1 +; GFX11-FAKE16-NEXT: ;;#ASMEND +; GFX11-FAKE16-NEXT: s_endpgm %vgpr = call i32 @llvm.amdgcn.workitem.id.x() %alloca0 = alloca [17 x i32], align 8, addrspace(5) %alloca1 = alloca i32, align 4, addrspace(5) @@ -507,6 +2481,69 @@ define amdgpu_kernel void @v_multiple_frame_indexes_literal_offsets() #0 { ; GCN: v_mov_b32_e32 [[ALLOCA0:v[0-9]+]], 0{{$}} ; GCN: ; use [[SELECT]], [[ALLOCA0]] define amdgpu_kernel void @v_multiple_frame_indexes_one_imm_one_literal_offset() #0 { +; CI-LABEL: v_multiple_frame_indexes_one_imm_one_literal_offset: +; CI: ; %bb.0: +; CI-NEXT: s_add_u32 s0, s0, s17 +; CI-NEXT: v_mov_b32_e32 v1, 0x44 +; CI-NEXT: v_mov_b32_e32 v2, 64 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: s_addc_u32 s1, s1, 0 +; CI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: ;;#ASMSTART +; CI-NEXT: ; use v0, v1 +; CI-NEXT: ;;#ASMEND +; CI-NEXT: s_endpgm +; +; GFX9-MUBUF-LABEL: 
v_multiple_frame_indexes_one_imm_one_literal_offset: +; GFX9-MUBUF: ; %bb.0: +; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s17 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v1, 0x44 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 64 +; GFX9-MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-MUBUF-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-MUBUF-NEXT: ;;#ASMSTART +; GFX9-MUBUF-NEXT: ; use v0, v1 +; GFX9-MUBUF-NEXT: ;;#ASMEND +; GFX9-MUBUF-NEXT: s_endpgm +; +; GFX9-FLATSCR-LABEL: v_multiple_frame_indexes_one_imm_one_literal_offset: +; GFX9-FLATSCR: ; %bb.0: +; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x44 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 64 +; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GFX9-FLATSCR-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-FLATSCR-NEXT: ;;#ASMSTART +; GFX9-FLATSCR-NEXT: ; use v0, v1 +; GFX9-FLATSCR-NEXT: ;;#ASMEND +; GFX9-FLATSCR-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: v_multiple_frame_indexes_one_imm_one_literal_offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, 64 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, 0x44, v1 :: v_dual_mov_b32 v1, 0 +; GFX11-TRUE16-NEXT: ;;#ASMSTART +; GFX11-TRUE16-NEXT: ; use v0, v1 +; GFX11-TRUE16-NEXT: ;;#ASMEND +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: v_multiple_frame_indexes_one_imm_one_literal_offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, 64 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: 
v_dual_cndmask_b32 v0, 0x44, v1 :: v_dual_mov_b32 v1, 0 +; GFX11-FAKE16-NEXT: ;;#ASMSTART +; GFX11-FAKE16-NEXT: ; use v0, v1 +; GFX11-FAKE16-NEXT: ;;#ASMEND +; GFX11-FAKE16-NEXT: s_endpgm %vgpr = call i32 @llvm.amdgcn.workitem.id.x() %alloca0 = alloca [16 x i32], align 8, addrspace(5) %alloca1 = alloca i32, align 4, addrspace(5) @@ -525,6 +2562,69 @@ define amdgpu_kernel void @v_multiple_frame_indexes_one_imm_one_literal_offset() ; GCN: v_mov_b32_e32 [[ALLOCA0:v[0-9]+]], 0{{$}} ; GCN: ; use [[SELECT]], [[ALLOCA0]] define amdgpu_kernel void @v_multiple_frame_indexes_imm_offsets() #0 { +; CI-LABEL: v_multiple_frame_indexes_imm_offsets: +; CI: ; %bb.0: +; CI-NEXT: s_add_u32 s0, s0, s17 +; CI-NEXT: v_mov_b32_e32 v1, 12 +; CI-NEXT: v_mov_b32_e32 v2, 8 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: s_addc_u32 s1, s1, 0 +; CI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: ;;#ASMSTART +; CI-NEXT: ; use v0, v1 +; CI-NEXT: ;;#ASMEND +; CI-NEXT: s_endpgm +; +; GFX9-MUBUF-LABEL: v_multiple_frame_indexes_imm_offsets: +; GFX9-MUBUF: ; %bb.0: +; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s17 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v1, 12 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 8 +; GFX9-MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-MUBUF-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-MUBUF-NEXT: ;;#ASMSTART +; GFX9-MUBUF-NEXT: ; use v0, v1 +; GFX9-MUBUF-NEXT: ;;#ASMEND +; GFX9-MUBUF-NEXT: s_endpgm +; +; GFX9-FLATSCR-LABEL: v_multiple_frame_indexes_imm_offsets: +; GFX9-FLATSCR: ; %bb.0: +; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 12 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 8 +; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GFX9-FLATSCR-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-FLATSCR-NEXT: ;;#ASMSTART +; 
GFX9-FLATSCR-NEXT: ; use v0, v1 +; GFX9-FLATSCR-NEXT: ;;#ASMEND +; GFX9-FLATSCR-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: v_multiple_frame_indexes_imm_offsets: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, 8 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, 12, v1 :: v_dual_mov_b32 v1, 0 +; GFX11-TRUE16-NEXT: ;;#ASMSTART +; GFX11-TRUE16-NEXT: ; use v0, v1 +; GFX11-TRUE16-NEXT: ;;#ASMEND +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: v_multiple_frame_indexes_imm_offsets: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, 8 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, 12, v1 :: v_dual_mov_b32 v1, 0 +; GFX11-FAKE16-NEXT: ;;#ASMSTART +; GFX11-FAKE16-NEXT: ; use v0, v1 +; GFX11-FAKE16-NEXT: ;;#ASMEND +; GFX11-FAKE16-NEXT: s_endpgm %vgpr = call i32 @llvm.amdgcn.workitem.id.x() %alloca0 = alloca [2 x i32], align 8, addrspace(5) %alloca1 = alloca i32, align 4, addrspace(5) @@ -536,3 +2636,7 @@ define amdgpu_kernel void @v_multiple_frame_indexes_imm_offsets() #0 { } attributes #0 = { nounwind } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GCN: {{.*}} +; GFX9: {{.*}} +; MUBUF: {{.*}} diff --git a/llvm/test/CodeGen/ARM/cmp-peephole.ll b/llvm/test/CodeGen/ARM/cmp-peephole.ll index 73888558e6647..54a34e3f6077d 100644 --- a/llvm/test/CodeGen/ARM/cmp-peephole.ll +++ b/llvm/test/CodeGen/ARM/cmp-peephole.ll @@ -137,23 +137,17 @@ define i1 @cmp_ne_zero_or_rr(i32 %a, i32 %b) { define i1 @cmp_ne_zero_or_ri(i32 %a) { ; ARM-LABEL: cmp_ne_zero_or_ri: ; ARM: @ %bb.0: -; ARM-NEXT: orrs r0, r0, #42 -; ARM-NEXT: movwne r0, #1 +; ARM-NEXT: mov r0, #1 ; ARM-NEXT: bx lr ; ; THUMB-LABEL: cmp_ne_zero_or_ri: ; THUMB: @ %bb.0: -; THUMB-NEXT: movs r1, #42 -; THUMB-NEXT: orrs r0, r1 -; THUMB-NEXT: subs r1, r0, #1 -; THUMB-NEXT: sbcs r0, r1 +; THUMB-NEXT: movs r0, #1 ; THUMB-NEXT: bx lr ; ; THUMB2-LABEL: cmp_ne_zero_or_ri: ; THUMB2: @ %bb.0: -; THUMB2-NEXT: orrs r0, r0, #42 -; THUMB2-NEXT: it ne -; THUMB2-NEXT: movne r0, #1 +; THUMB2-NEXT: movs r0, #1 ; THUMB2-NEXT: bx lr %or = or i32 %a, 42 %res = icmp ne i32 %or, 0 @@ -726,10 +720,7 @@ define i1 @cmp_eq_zero_or_ri(i32 %a) { ; ; THUMB-LABEL: cmp_eq_zero_or_ri: ; THUMB: @ %bb.0: -; THUMB-NEXT: movs r1, #42 -; THUMB-NEXT: orrs r0, r1 -; THUMB-NEXT: rsbs r1, r0, #0 -; THUMB-NEXT: adcs r0, r1 +; THUMB-NEXT: movs r0, #0 ; THUMB-NEXT: bx lr ; ; THUMB2-LABEL: cmp_eq_zero_or_ri: diff --git a/llvm/test/CodeGen/ARM/hoist-and-by-const-from-shl-in-eqcmp-zero.ll b/llvm/test/CodeGen/ARM/hoist-and-by-const-from-shl-in-eqcmp-zero.ll index a8421ae9a6a89..77dd121b5e797 100644 --- a/llvm/test/CodeGen/ARM/hoist-and-by-const-from-shl-in-eqcmp-zero.ll +++ b/llvm/test/CodeGen/ARM/hoist-and-by-const-from-shl-in-eqcmp-zero.ll @@ -1067,23 +1067,10 @@ define i1 @scalar_i8_signbit_eq_with_nonzero(i8 %x, i8 %y) nounwind { ; ARM-NEXT: mov r0, #0 ; ARM-NEXT: bx lr ; -; THUMB6-LABEL: scalar_i8_signbit_eq_with_nonzero: -; THUMB6: @ %bb.0: -; THUMB6-NEXT: uxtb r1, r1 -; THUMB6-NEXT: movs r2, #127 -; THUMB6-NEXT: mvns r2, r2 -; THUMB6-NEXT: lsls r2, r1 -; THUMB6-NEXT: ands 
r2, r0 -; THUMB6-NEXT: uxtb r0, r2 -; THUMB6-NEXT: subs r1, r0, #1 -; THUMB6-NEXT: rsbs r0, r1, #0 -; THUMB6-NEXT: adcs r0, r1 -; THUMB6-NEXT: bx lr -; -; THUMB78-LABEL: scalar_i8_signbit_eq_with_nonzero: -; THUMB78: @ %bb.0: -; THUMB78-NEXT: movs r0, #0 -; THUMB78-NEXT: bx lr +; THUMB-LABEL: scalar_i8_signbit_eq_with_nonzero: +; THUMB: @ %bb.0: +; THUMB-NEXT: movs r0, #0 +; THUMB-NEXT: bx lr %t0 = shl i8 128, %y %t1 = and i8 %t0, %x %res = icmp eq i8 %t1, 1 ; should be comparing with 0 diff --git a/llvm/test/CodeGen/ARM/select-imm.ll b/llvm/test/CodeGen/ARM/select-imm.ll index 186276b50ceeb..3b8170aa16866 100644 --- a/llvm/test/CodeGen/ARM/select-imm.ll +++ b/llvm/test/CodeGen/ARM/select-imm.ll @@ -652,13 +652,11 @@ define i1 @t10() { ; V8MBASE-NEXT: .pad #8 ; V8MBASE-NEXT: sub sp, #8 ; V8MBASE-NEXT: movs r0, #7 -; V8MBASE-NEXT: mvns r1, r0 -; V8MBASE-NEXT: str r1, [sp] -; V8MBASE-NEXT: adds r0, r1, #5 +; V8MBASE-NEXT: mvns r0, r0 +; V8MBASE-NEXT: str r0, [sp] +; V8MBASE-NEXT: adds r0, r0, #5 ; V8MBASE-NEXT: str r0, [sp, #4] -; V8MBASE-NEXT: adds r1, #8 -; V8MBASE-NEXT: rsbs r0, r1, #0 -; V8MBASE-NEXT: adcs r0, r1 +; V8MBASE-NEXT: movs r0, #1 ; V8MBASE-NEXT: add sp, #8 ; V8MBASE-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll b/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll index 559bb68741e12..a0a0a19c0322f 100644 --- a/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll +++ b/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll @@ -10,35 +10,10 @@ define i32 @fred(ptr %a0) #0 { ; CHECK-LABEL: fred: ; CHECK: // %bb.0: // %b0 ; CHECK-NEXT: { -; CHECK-NEXT: if (p0) jump:nt .LBB0_2 -; CHECK-NEXT: } -; CHECK-NEXT: // %bb.1: // %b2 -; CHECK-NEXT: { -; CHECK-NEXT: r3:2 = combine(#0,#0) -; CHECK-NEXT: r1:0 = memd(r0+#0) -; CHECK-NEXT: } -; CHECK-NEXT: { -; CHECK-NEXT: p0 = vcmph.eq(r1:0,r3:2) -; CHECK-NEXT: } -; CHECK-NEXT: { -; CHECK-NEXT: r1:0 = mask(p0) -; CHECK-NEXT: } -; CHECK-NEXT: { -; CHECK-NEXT: r0 = and(r0,#1) -; CHECK-NEXT: } -; CHECK-NEXT: { -; 
CHECK-NEXT: p0 = cmp.eq(r0,#11) -; CHECK-NEXT: r0 = #1 -; CHECK-NEXT: } -; CHECK-NEXT: { +; CHECK-NEXT: if (!p0) r0 = #1 ; CHECK-NEXT: if (p0) r0 = #0 ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: } -; CHECK-NEXT: .LBB0_2: // %b14 -; CHECK-NEXT: { -; CHECK-NEXT: r0 = #0 -; CHECK-NEXT: jumpr r31 -; CHECK-NEXT: } b0: switch i32 undef, label %b14 [ i32 5, label %b2 diff --git a/llvm/test/CodeGen/LoongArch/sextw-removal.ll b/llvm/test/CodeGen/LoongArch/sextw-removal.ll index 0c31ff9eee1f2..b0aebd9f39e9d 100644 --- a/llvm/test/CodeGen/LoongArch/sextw-removal.ll +++ b/llvm/test/CodeGen/LoongArch/sextw-removal.ll @@ -326,11 +326,13 @@ declare i64 @llvm.ctpop.i64(i64) define void @test8(i32 signext %arg, i32 signext %arg1) nounwind { ; CHECK-LABEL: test8: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: addi.d $sp, $sp, -16 -; CHECK-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $sp, $sp, -32 +; CHECK-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; CHECK-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill +; CHECK-NEXT: st.d $s0, $sp, 8 # 8-byte Folded Spill ; CHECK-NEXT: sra.w $a0, $a0, $a1 -; CHECK-NEXT: addi.w $fp, $zero, -256 +; CHECK-NEXT: addi.d $fp, $zero, -256 +; CHECK-NEXT: ori $s0, $zero, 1 ; CHECK-NEXT: .p2align 4, , 16 ; CHECK-NEXT: .LBB7_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -338,20 +340,23 @@ define void @test8(i32 signext %arg, i32 signext %arg1) nounwind { ; CHECK-NEXT: pcaddu18i $ra, %call36(foo) ; CHECK-NEXT: jirl $ra, $ra, 0 ; CHECK-NEXT: or $a0, $a0, $fp -; CHECK-NEXT: bnez $a0, .LBB7_1 +; CHECK-NEXT: bnez $s0, .LBB7_1 ; CHECK-NEXT: # %bb.2: # %bb7 -; CHECK-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 16 +; CHECK-NEXT: ld.d $s0, $sp, 8 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; CHECK-NEXT: addi.d 
$sp, $sp, 32 ; CHECK-NEXT: ret ; ; NORMV-LABEL: test8: ; NORMV: # %bb.0: # %bb -; NORMV-NEXT: addi.d $sp, $sp, -16 -; NORMV-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill -; NORMV-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill +; NORMV-NEXT: addi.d $sp, $sp, -32 +; NORMV-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; NORMV-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill +; NORMV-NEXT: st.d $s0, $sp, 8 # 8-byte Folded Spill ; NORMV-NEXT: sra.w $a0, $a0, $a1 -; NORMV-NEXT: addi.w $fp, $zero, -256 +; NORMV-NEXT: addi.d $fp, $zero, -256 +; NORMV-NEXT: ori $s0, $zero, 1 ; NORMV-NEXT: .p2align 4, , 16 ; NORMV-NEXT: .LBB7_1: # %bb2 ; NORMV-NEXT: # =>This Inner Loop Header: Depth=1 @@ -359,11 +364,12 @@ define void @test8(i32 signext %arg, i32 signext %arg1) nounwind { ; NORMV-NEXT: pcaddu18i $ra, %call36(foo) ; NORMV-NEXT: jirl $ra, $ra, 0 ; NORMV-NEXT: or $a0, $a0, $fp -; NORMV-NEXT: bnez $a0, .LBB7_1 +; NORMV-NEXT: bnez $s0, .LBB7_1 ; NORMV-NEXT: # %bb.2: # %bb7 -; NORMV-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload -; NORMV-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload -; NORMV-NEXT: addi.d $sp, $sp, 16 +; NORMV-NEXT: ld.d $s0, $sp, 8 # 8-byte Folded Reload +; NORMV-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload +; NORMV-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; NORMV-NEXT: addi.d $sp, $sp, 32 ; NORMV-NEXT: ret bb: %i = ashr i32 %arg, %arg1 diff --git a/llvm/test/CodeGen/RISCV/pr64935.ll b/llvm/test/CodeGen/RISCV/pr64935.ll index 60be5fa6c994e..b712db0dc99d6 100644 --- a/llvm/test/CodeGen/RISCV/pr64935.ll +++ b/llvm/test/CodeGen/RISCV/pr64935.ll @@ -4,10 +4,7 @@ define i1 @f() { ; CHECK-LABEL: f: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 524288 -; CHECK-NEXT: not a0, a0 -; CHECK-NEXT: sltiu a0, a0, 2 -; CHECK-NEXT: xori a0, a0, 1 +; CHECK-NEXT: li a0, 1 ; CHECK-NEXT: ret %B25 = shl i64 4294967296, -9223372036854775808 %B13 = sub i64 -1, -9223372036854775808 diff --git a/llvm/test/CodeGen/RISCV/sextw-removal.ll b/llvm/test/CodeGen/RISCV/sextw-removal.ll index 
b155feab9b4d9..f048d67fa37f7 100644 --- a/llvm/test/CodeGen/RISCV/sextw-removal.ll +++ b/llvm/test/CodeGen/RISCV/sextw-removal.ll @@ -422,7 +422,7 @@ define void @test8(i32 signext %arg, i32 signext %arg1) nounwind { ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: call foo ; CHECK-NEXT: ori a0, a0, -256 -; CHECK-NEXT: bnez a0, .LBB7_1 +; CHECK-NEXT: j .LBB7_1 ; CHECK-NEXT: # %bb.2: # %bb7 ; CHECK-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; CHECK-NEXT: addi sp, sp, 16 @@ -438,7 +438,7 @@ define void @test8(i32 signext %arg, i32 signext %arg1) nounwind { ; NOREMOVAL-NEXT: sext.w a0, a0 ; NOREMOVAL-NEXT: call foo ; NOREMOVAL-NEXT: ori a0, a0, -256 -; NOREMOVAL-NEXT: bnez a0, .LBB7_1 +; NOREMOVAL-NEXT: j .LBB7_1 ; NOREMOVAL-NEXT: # %bb.2: # %bb7 ; NOREMOVAL-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; NOREMOVAL-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/X86/2007-10-12-CoalesceExtSubReg.ll b/llvm/test/CodeGen/X86/2007-10-12-CoalesceExtSubReg.ll index cfb3e508576dd..9c7b76f5ec3ac 100644 --- a/llvm/test/CodeGen/X86/2007-10-12-CoalesceExtSubReg.ll +++ b/llvm/test/CodeGen/X86/2007-10-12-CoalesceExtSubReg.ll @@ -4,25 +4,18 @@ define signext i16 @f(ptr %bp, ptr %ss) { ; CHECK-LABEL: f: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushl %esi -; CHECK-NEXT: .cfi_def_cfa_offset 8 -; CHECK-NEXT: .cfi_offset %esi, -8 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movb $1, %cl ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: # %cond_next127 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: movl (%eax), %edx -; CHECK-NEXT: movl (%ecx), %esi ; CHECK-NEXT: andl $15, %edx -; CHECK-NEXT: andl $15, %esi -; CHECK-NEXT: addl %esi, (%ecx) -; CHECK-NEXT: cmpl $63, %edx -; CHECK-NEXT: jb .LBB0_1 +; CHECK-NEXT: addl %edx, (%eax) +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: jne .LBB0_1 ; CHECK-NEXT: # %bb.2: # %UnifiedReturnBlock ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: popl %esi -; CHECK-NEXT: 
.cfi_def_cfa_offset 4 ; CHECK-NEXT: retl entry: br label %cond_next127 diff --git a/llvm/test/CodeGen/X86/apx/or.ll b/llvm/test/CodeGen/X86/apx/or.ll index 514a7d83b78b0..d75d4225d2880 100644 --- a/llvm/test/CodeGen/X86/apx/or.ll +++ b/llvm/test/CodeGen/X86/apx/or.ll @@ -621,18 +621,18 @@ define i1 @orflag64rm(ptr %ptr, i64 %b) { define i1 @orflag8ri(i8 %a) { ; CHECK-LABEL: orflag8ri: ; CHECK: # %bb.0: -; CHECK-NEXT: orb $-124, %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xcf,0x84] -; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; CHECK-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A] +; CHECK-NEXT: orb $-124, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0xcf,0x84] +; CHECK-NEXT: movb %al, d64(%rip) # encoding: [0x88,0x05,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte +; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: orflag8ri: ; NF: # %bb.0: -; NF-NEXT: orb $-124, %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xcf,0x84] -; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; NF-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A] +; NF-NEXT: {nf} orb $-124, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0xcf,0x84] +; NF-NEXT: movb %al, d64(%rip) # encoding: [0x88,0x05,A,A,A,A] ; NF-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte +; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] ; NF-NEXT: retq # encoding: [0xc3] %xor = xor i8 123, -1 %v0 = or i8 %a, %xor ; 0xff << 50 @@ -644,20 +644,20 @@ define i1 @orflag8ri(i8 %a) { define i1 @orflag16ri(i16 %a) { ; CHECK-LABEL: orflag16ri: ; CHECK: # %bb.0: -; CHECK-NEXT: orw $-1235, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x81,0xcf,0x2d,0xfb] +; CHECK-NEXT: orw $-1235, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x81,0xcf,0x2d,0xfb] ; CHECK-NEXT: # imm = 0xFB2D -; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; CHECK-NEXT: movw %cx, d64(%rip) # encoding: 
[0x66,0x89,0x0d,A,A,A,A] +; CHECK-NEXT: movw %ax, d64(%rip) # encoding: [0x66,0x89,0x05,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte +; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: orflag16ri: ; NF: # %bb.0: -; NF-NEXT: orw $-1235, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x81,0xcf,0x2d,0xfb] +; NF-NEXT: {nf} orw $-1235, %di, %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x81,0xcf,0x2d,0xfb] ; NF-NEXT: # imm = 0xFB2D -; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; NF-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A] +; NF-NEXT: movw %ax, d64(%rip) # encoding: [0x66,0x89,0x05,A,A,A,A] ; NF-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte +; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] ; NF-NEXT: retq # encoding: [0xc3] %xor = xor i16 1234, -1 %v0 = or i16 %a, %xor ; 0xff << 50 @@ -671,18 +671,18 @@ define i1 @orflag32ri(i32 %a) { ; CHECK: # %bb.0: ; CHECK-NEXT: orl $123456, %edi # EVEX TO LEGACY Compression encoding: [0x81,0xcf,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 -; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] ; CHECK-NEXT: movl %edi, d64(%rip) # encoding: [0x89,0x3d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte +; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: orflag32ri: ; NF: # %bb.0: ; NF-NEXT: orl $123456, %edi # EVEX TO LEGACY Compression encoding: [0x81,0xcf,0x40,0xe2,0x01,0x00] ; NF-NEXT: # imm = 0x1E240 -; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] ; NF-NEXT: movl %edi, d64(%rip) # encoding: [0x89,0x3d,A,A,A,A] ; NF-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte +; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] ; NF-NEXT: retq # encoding: [0xc3] %v0 = or i32 %a, 123456 ; 0xff << 50 %v1 = icmp eq i32 %v0, 0 @@ -695,18 +695,18 @@ define i1 @orflag64ri(i64 %a) { ; CHECK: # 
%bb.0: ; CHECK-NEXT: orq $123456, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x81,0xcf,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 -; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] ; CHECK-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte +; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: orflag64ri: ; NF: # %bb.0: ; NF-NEXT: orq $123456, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x81,0xcf,0x40,0xe2,0x01,0x00] ; NF-NEXT: # imm = 0x1E240 -; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] ; NF-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A] ; NF-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte +; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] ; NF-NEXT: retq # encoding: [0xc3] %v0 = or i64 %a, 123456 ; 0xff << 50 %v1 = icmp eq i64 %v0, 0 @@ -717,18 +717,18 @@ define i1 @orflag64ri(i64 %a) { define i1 @orflag16ri8(i16 %a) { ; CHECK-LABEL: orflag16ri8: ; CHECK: # %bb.0: -; CHECK-NEXT: orw $-124, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xcf,0x84] -; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; CHECK-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A] +; CHECK-NEXT: orw $-124, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x83,0xcf,0x84] +; CHECK-NEXT: movw %ax, d64(%rip) # encoding: [0x66,0x89,0x05,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte +; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: orflag16ri8: ; NF: # %bb.0: -; NF-NEXT: orw $-124, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xcf,0x84] -; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; NF-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A] +; NF-NEXT: {nf} orw $-124, %di, %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x83,0xcf,0x84] +; NF-NEXT: movw %ax, d64(%rip) 
# encoding: [0x66,0x89,0x05,A,A,A,A] ; NF-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte +; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] ; NF-NEXT: retq # encoding: [0xc3] %xor = xor i16 123, -1 %v0 = or i16 %a, %xor ; 0xff << 50 @@ -741,17 +741,17 @@ define i1 @orflag32ri8(i32 %a) { ; CHECK-LABEL: orflag32ri8: ; CHECK: # %bb.0: ; CHECK-NEXT: orl $123, %edi # EVEX TO LEGACY Compression encoding: [0x83,0xcf,0x7b] -; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] ; CHECK-NEXT: movl %edi, d64(%rip) # encoding: [0x89,0x3d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte +; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: orflag32ri8: ; NF: # %bb.0: ; NF-NEXT: orl $123, %edi # EVEX TO LEGACY Compression encoding: [0x83,0xcf,0x7b] -; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] ; NF-NEXT: movl %edi, d64(%rip) # encoding: [0x89,0x3d,A,A,A,A] ; NF-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte +; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] ; NF-NEXT: retq # encoding: [0xc3] %v0 = or i32 %a, 123 ; 0xff << 50 %v1 = icmp eq i32 %v0, 0 @@ -763,17 +763,17 @@ define i1 @orflag64ri8(i64 %a) { ; CHECK-LABEL: orflag64ri8: ; CHECK: # %bb.0: ; CHECK-NEXT: orq $123, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xcf,0x7b] -; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] ; CHECK-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte +; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: orflag64ri8: ; NF: # %bb.0: ; NF-NEXT: orq $123, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xcf,0x7b] -; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] ; NF-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A] ; NF-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte +; NF-NEXT: xorl 
%eax, %eax # encoding: [0x31,0xc0] ; NF-NEXT: retq # encoding: [0xc3] %v0 = or i64 %a, 123 ; 0xff << 50 %v1 = icmp eq i64 %v0, 0